[{"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, 
T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n#%%\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and 
torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n#%%\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n#%%\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n#%%\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):", "original_comment": " # pad zero at the end\n", "target_code": " res_1 = np.concatenate([res_1, np.zeros(len(mix)-len(res_1))])\n", "project_metadata": {"full_name": "naplab/DANet", "description": "Deep Attractor Network (DANet) for single-channel speech separation", "topics": [], "git_url": "git://github.com/naplab/DANet.git", "stars": 53, "watchers": 53, "forks": 15, "created": "2018-09-18T21:26:22Z", "size": 11, "license": "mit", 
"language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23340, "Python": 4814}, "last_updated": "2020-12-14T07:40:33Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "res_1 = np.pad(res_1, (0, args.infeat_dim-len(res_1)), 'constant')\nres_2 = np.pad(res_2, (0, args.infeat_dim-len(res_2)), 'constant')\n", "model": "docstring", "intent": " # pad zero at the end"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n 
test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n#%%\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n#%%\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n#%%\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n#%%\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n#%%\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)", "original_comment": "# Reshape validation data so 
that model can be run\n", "target_code": "X_valid = X_valid.values.reshape(-1, 6)\n", "project_metadata": {"full_name": "abhishek3aj/ML1819--task-101--team-06", "description": "ML framework comparison", "topics": [], "git_url": "git://github.com/abhishek3aj/ML1819--task-101--team-06.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-10-09T09:48:20Z", "size": 21107, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4638466, "Python": 84406}, "last_updated": "2018-12-17T19:27:23Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3, "usefulness": "Strongly disagree", "usefulness-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "y_valid_pred = lrm.predict(X_valid)\ny_test_pred = lrm.predict(X_test)\ny_test_pred_prob = lrm.predict_proba(X_test)[:, 1]\nprint('Accuracy: ', accuracy_score(y_valid, y_valid_pred))\nprint('Precision: ', precision_score(y_valid, y_valid_pred))\nprint('Recall: ', recall_score(y_valid, y_valid_pred))\nprint('F1 Score: ', f1_score(y_valid, y_valid_pred))\nprint('Confusion\n", "model": "natural", "intent": "# Reshape validation data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World 
population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. 
This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin 
weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### 
Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)", "original_comment": "# Add the title\n", "target_code": "plt.title('Fuel efficiency vs Horse-power')\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "df.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "model": "no-comments", "intent": "# Add the title"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 
Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
The training will use mini-batches with the size defined above.\n\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. 
* \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: {:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. 
We'll also do **the same** transformations as on train/validation dataset (very important).\n\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. 
We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n#%%\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n#%%\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n#%%\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n#%%\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n#%%\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n#%%\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n#%%\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n#%%\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n#%%\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n#%%\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n#%%\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n#%%\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n#%%\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n#%%\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n#%%\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
The training will use mini-batches with the size defined above.\n\n#%%\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... 
it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n#%%\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n#%%\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n#%%\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: 
{:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n#%%\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n#%%\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n#%%\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. We'll also do **the same** transformations as on train/validation dataset (very important).\n\n#%%\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n#%%\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. 
It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n#%%\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])", "original_comment": "# Annotate plots\n", "target_code": "subplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\n", "project_metadata": {"full_name": "jpowie01/CIFAR10-HandsOn", "description": "Hands-on prepared for one of my presentations that took place on Computer Vision's mini-course at student's orgranization called \"Gradient\" (Gda\u0144sk University of Technology)", "topics": ["deep-learning", "convolutional-neural-networks", "cifar10", "jupyter-notebook", "hands-on"], "git_url": "git://github.com/jpowie01/CIFAR10-HandsOn.git", "stars": 6, "watchers": 6, "forks": 0, "created": "2018-01-03T21:22:35Z", "size": 9589, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1717141}, "last_updated": "2018-01-09T19:26:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "\n", "model": "no-comments", "intent": "# Annotate plots"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## 
Imports\n\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n#%%\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n#%%\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n#%%\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n#%%\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n#%%\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n#%%", "original_comment": "# Show the top 10 rows of the sample\n", "target_code": "sample_df.head()\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "compatibility": "Disagree", "compatibility-score": 1, "precision": "Strongly agree", "precision-score": 3}], "predicted_code": "flights_df.head(10)\n", "model": "natural", "intent": "# Show the top 10 rows of the sample"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n\n# Get the data\nflights_df = 
pd.read_csv(data_file)\n\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n#%%\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n#%%\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n#%%\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n#%%\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n#%%\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n#%%", "original_comment": "# Show the top 10 rows of the sample\n", "target_code": "sample_df.head()\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "compatibility": "Disagree", "compatibility-score": 1, "precision": "Strongly agree", "precision-score": 3}], "predicted_code": "flights_df.head(5)\n", "model": "no-comments", "intent": "# Show the top 10 rows of the sample"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 
100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)\n\n\n\nDATE = '04202020'\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)", "original_comment": "# ## Read in information\n", "target_code": "PLdb = pd.read_csv(\n '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))\n", "project_metadata": {"full_name": "MrOlm/covid19_population_genomics", "description": "Analysis of the population diversity of SARS-CoV-2 within and between individual patients", "topics": [], "git_url": "git://github.com/MrOlm/covid19_population_genomics.git", "stars": 9, "watchers": 9, "forks": 1, "created": "2020-03-20T16:01:19Z", "size": 170583, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 40959012, "Python": 1028}, "last_updated": "2020-12-05T12:24:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "get_ipython().run_cell_magic('bash', '',\n 'if [! 
-d /cellar/users/btsui/Data/nrnb01_nobackup/METAMAP/ ] ; then\\n wget -O /cellar/users/btsui/Data/nrnb01_nobackup/METAMAP/allSRS.pickle.gz http://cf.10xgenomics.com/samples/cell-exp/2.1.0/samples.pickle.gz\\nfi')\n", "model": "no-comments", "intent": "# Read in information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n\ndata.head()\n# look at the dataset\n\n\ndata.info()\n# basic info of dataset\n\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n\ndata.describe()\n\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n\ndata.columns\n\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n\n# Data Visulaization\n\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n\nsns.countplot(data['room_type'])\n\n\n# We can observe reduced preference in shared rooms\n\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n\n# heavily skewed\n\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n\ndata.corr()\n\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month 
and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n\n# to plot locaation and price on NY city map\n\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)\nplt.figure(figsize=(12, 12), facecolor='black')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n#%%\n\ndata.head()\n# look at the dataset\n\n#%%\n\ndata.info()\n# basic info of dataset\n\n#%%\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n#%%\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n#%%\n\ndata.describe()\n\n#%%\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n#%%\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n#%%\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n#%%\n\ndata.columns\n\n#%%\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n#%%\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n#%%\n\n# Data Visulaization\n\n#%%\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n#%%\n\nsns.countplot(data['room_type'])\n\n#%%\n\n# We can observe reduced preference in shared rooms\n\n#%%\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n#%%\n\n# heavily skewed\n\n#%%\n\nfor col in num_col:\n 
count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n#%%\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n#%%\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n#%%\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n#%%\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n#%%\n\ndata.corr()\n\n#%%\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n#%%\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n#%%\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n#%%\n\n# to plot locaation and price on NY city map\n\n#%%\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n#%%\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n#%%\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n#%%\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n#%%\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)", "original_comment": "# Plot the counts\n", "target_code": "ax = plt.subplot(1, 1, 1)\nplt.imshow(img, 'hot')\nplt.axis('off')\nplt.tight_layout()\n", "project_metadata": {"full_name": "maheshd20/Da_project_sem5", "description": null, "topics": [], "git_url": "git://github.com/maheshd20/Da_project_sem5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-30T13:22:44Z", "size": 5278, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1470956}, "last_updated": "2020-11-30T15:37:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, 
{"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure(figsize=(12, 12), facecolor='black')\nplt.imshow(img)\n", "model": "natural", "intent": "# Plot the counts"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n#%%\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n#%%\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n#%%\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n#%%\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], 
estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n#%%\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n#%%\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n#%%\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n#%%\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n#%%\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n#%%\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n#%%\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()", "original_comment": "# ### Use a different marker for the hue levels:\n", "target_code": "palette = dict(Lunch=\"blue\", Dinner=\"red\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\", palette=palette,\n hue_order=[\"Dinner\", \"Lunch\"],\n hue_kws=dict(marker=[\"^\", \"v\"]))\n", "project_metadata": {"full_name": "scidatmath2020/Ciencia-de-datos-con-Python", "description": null, "topics": [], "git_url": "git://github.com/scidatmath2020/Ciencia-de-datos-con-Python.git", "stars": 20, "watchers": 20, "forks": 27, "created": "2020-09-07T20:49:59Z", "size": 20544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5705341, "Python": 12821}, "last_updated": "2020-11-19T22:06:09Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Strongly disagree", "coverage-score": 0, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total 
Bill\")\nsns.despine(offset=10, trim=True)\n", "model": "no-comments", "intent": "# Use a different marker for the hue levels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. 
Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n\nfrom numpy.random import randint\nimport numpy as np\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. 
Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. 
How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n#%%\n\nfrom numpy.random import randint\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n#%%\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n#%%\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n#%%\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n#%%", "original_comment": "# plot the distribution of sample means\n", "target_code": "from matplotlib import pyplot as plt\n\nplt.hist(means_100)\nplt.show()\n", "project_metadata": {"full_name": "summerela/python_data_analysis", "description": "Introduction to Data Analysis with Python for UW Foster School of Business", "topics": [], "git_url": "git://github.com/summerela/python_data_analysis.git", "stars": 11, "watchers": 11, "forks": 27, "created": "2019-06-08T02:35:32Z", "size": 7972, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7883836}, "last_updated": "2020-11-09T16:54:13Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "np.mean(means_100)\n", "model": "no-comments", "intent": "# plot the distribution of sample means"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as 
optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n\n# ## Data\n\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n\n# DATA_DF.content[0]\n\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n#%%\n\n# ## Data\n\n#%%\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n#%%\n\n# DATA_DF.content[0]\n\n#%%\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n#%%", "original_comment": "# remove reviews without audio features from Spotify\n", "target_code": "DATA_DF = DATA_DF.loc[~DATA_DF.audio_features.isna()]\n", "project_metadata": {"full_name": "iconix/openai", "description": "OpenAI Scholar, general materials", "topics": [], "git_url": "git://github.com/iconix/openai.git", "stars": 16, "watchers": 16, "forks": 3, "created": "2018-11-02T19:26:13Z", "size": 69033, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 22113154, "Python": 46353, "JavaScript": 8783, "Shell": 2297, "HTML": 970}, "last_updated": "2020-06-01T14:04:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df = DATA_DF.sample(frac=1).reset_index(drop=True)\n", "model": "no-comments", "intent": "# remove reviews without audio 
features from Spotify"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. 
Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. 
\"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. 
Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. 
the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "from sklearn.svm import SVC\n\nsvm = SVC(kernel='linear', random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "svm_clf = SVC(kernel='linear', C=1.0, random_state=42)\nsvm_clf.fit(X, y)\n", "model": "docstring", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n#%%\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n#%%\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n#%%\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n#%%", "original_comment": "# get list of r-band images\n", "target_code": "imgList_r = list(glob('Inputs/photo_r_CNN*.fits'))\n", "project_metadata": {"full_name": "cbottrell/RealSim", "description": "RealSim is the statistical observational realism suite described in Bottrell et al 2017ab and made public in Bottrell et al 2019b.", "topics": [], "git_url": "git://github.com/cbottrell/RealSim.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2019-07-10T21:26:45Z", "size": 20047, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2416365, "C": 294600, "Python": 34394, "Makefile": 4159, "Tcl": 1042, "Shell": 374, "C++": 88}, "last_updated": "2020-05-29T13:33:55Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Agree", "compatibility-score": 2, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "ifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SDSS\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n", "model": "natural", "intent": "# get list of r-band images"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. 
Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n\n# comments beginning with #BEE were written by bee martin\n\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), 
len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n\nastrometric_error = [0.035, 0.025] # [u-band error, g-band error]\n\nairmasses = uniform(low=1.0, high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n\n# GTR: Extract the airmass and filters for each observation\n\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. 
The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 
84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. tanZ plane for a single object\n\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. 
tanZ plane for all the objects, calling run_fit for each\n\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = 
\"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n#%%\n\n# comments beginning with #BEE were written by bee martin\n\n#%%\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n#%%\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n#%%\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n#%%\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n#%%\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n#%%\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n#%%\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n#%%\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n#%%\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n#%%\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n#%%\n\nastrometric_error = [0.035, 0.025] # [u-band 
error, g-band error]\n\nairmasses = uniform(low=1.0, high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n#%%\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n#%%\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n#%%\n\n# GTR: Extract the airmass and filters for each observation\n\n#%%\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n#%%\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n#%%\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n#%%\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n#%%\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n#%%\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n#%%\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n#%%\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n#%%\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n#%%\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n#%%\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n#%%\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n#%%\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n#%%\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n#%%\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n 
bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n#%%\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n#%%\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n#%%\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n#%%\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n#%%\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]", "original_comment": " # empty arrays to be filled\n", "target_code": " feature_distance = np.zeros((num_of_quasars, num_features, len(zshifts)))\n", "project_metadata": {"full_name": "RichardsGroup/LSSTprep", "description": "Repository for Richards group LSST prep work, specifically related to the AGN SC", "topics": [], "git_url": "git://github.com/RichardsGroup/LSSTprep.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2018-06-20T20:43:08Z", "size": 30265, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 8424521, "Python": 6419}, "last_updated": "2020-09-28T18:32:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "observed_u_20 = np.zeros((num_of_quasars, num_features))\nobserved_g_20 = np.zeros((num_of_quasars, num_features))\nobserved_u_3 = np.zeros((num_of_quasars, num_features))\nobserved_g_3 = np.zeros((num_of_quasars, num_features))\nobserved_u_gerr = np.zeros((num_of_quasars, num_features))\nobserved_g_gerr = np.zeros((num_of_quasars, num_features\n", "model": "natural", "intent": " # empty arrays to be filled"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a 
venv to revert back to Python 2 since DataReader does not work on Python 3\n\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)\nplt.legend()\nplt.savefig('lumpsum.png')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n#%%\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n#%%\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = 
spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n#%%\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n#%%\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n#%%\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)", "original_comment": "# Show with Legend\n", "target_code": "plt.show()\n", "project_metadata": {"full_name": "eonofrey/DollarCostAverage_vs._LumpSum", "description": "Comparing dollar cost averaging vs. 
lump sum investment in the SPY ", "topics": [], "git_url": "git://github.com/eonofrey/DollarCostAverage_vs._LumpSum.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-06-19T21:58:51Z", "size": 1525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 587938}, "last_updated": "2020-12-19T01:53:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "\n", "model": "no-comments", "intent": "# Show with Legend"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. 
Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. 
If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. However, it is important that the corpus is representative for this specific topic.\n\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. 
It helps to ignore your search keywords.\n\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. This is important so that the results are not distorted by over- or under-representation of a category.\n\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. 
Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n\ncorpus = train_df.bow\n\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the test corpus \n# First, have a 
look at your test corpus\n\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n\nresult = np.append(df_all.text.values.reshape(-1, 1),\n 
submission_predictions, axis=1)\n\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. 
This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. 
To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n#%%\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n#%%\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n#%%\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. 
However, it is important that the corpus is representative for this specific topic.\n\n#%%\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. It helps to ignore your search keywords.\n\n#%%\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n#%%\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n#%%\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n#%%\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n#%%\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. 
This is important so that the results are not distorted by over- or under-representation of a category.\n\n#%%\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n#%%\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n#%%\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n#%%\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n#%%\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n#%%\n\ncorpus = train_df.bow\n\n#%%\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n#%%\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n#%%\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n#%%\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n#%%\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n#%%\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n#%%\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n#%%\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the 
test corpus \n# First, have a look at your test corpus\n\n#%%\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n#%%\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n#%%\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n#%%\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n#%%\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n#%%\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n#%%\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n#%%\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n#%%\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n#%%\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n#%%\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n#%%\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n#%%\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n#%%\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n#%%\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n#%%\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n#%%\n\nresult = 
np.append(df_all.text.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n#%%\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n#%%\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n#%%\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n#%%\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n#%%\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n#%%\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n#%%\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])", "original_comment": "# Transform the dates from strings to integers\n", "target_code": "for i in range(0, len(date_ads)):\n date_ads[i] = int(date_ads[i])\n", "project_metadata": {"full_name": "NewsEye/NLP-Notebooks-Newspaper-Collections", "description": "A collection of notebooks for Natural Language Processing", "topics": ["lda", "topic-modeling", "shannon", "nlp-notebooks", "digital-humanities", "newspaper-collections", "newspaper-clippings", "text-classification", "similarity"], "git_url": "git://github.com/NewsEye/NLP-Notebooks-Newspaper-Collections.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-07-06T11:18:13Z", "size": 12866, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4306857}, "last_updated": "2020-12-01T08:54:40Z"}, "annotations": [{"completed_by": {"id": 
1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "date_ads = [int(x) for x in date_ads]\n", "model": "docstring", "intent": "# Transform the dates from strings to integers"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. 
So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n#%%", "original_comment": "# Let's import the set\n", "target_code": "import pandas as pd\n\ndf = pd.read_csv('../data/train.csv', index_col=0)\n", "project_metadata": {"full_name": "commit-live-students/GLabs_DSMX", "description": null, "topics": [], "git_url": "git://github.com/commit-live-students/GLabs_DSMX.git", "stars": 6, "watchers": 6, "forks": 23, "created": "2020-03-27T12:43:39Z", "size": 19480, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12966885}, "last_updated": "2020-12-24T07:12:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "df = pd.read_csv(\n 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter10/Datasets/advertising.csv')\n", "model": "docstring", "intent": "# Let's import the set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. 
freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n#%%\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport time\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n#%%\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n 
base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n#%%\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n#%%\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n#%%\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))", "original_comment": "# So we can look at the progress on Tensorboard\n", "target_code": "import time\n\ncallback = keras.callbacks.TensorBoard(\n log_dir='./logs/inception/2/{}'.format(time.time())\n)\n", "project_metadata": {"full_name": 
"adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "def create_model(num_classes=len(labelencoder.classes_)):\n model = Sequential()\n model.add(Conv2D(32, kernel_size=(3, 3),\n activation='relu', input_shape=(224, 224, 3)))\n model.add(Conv2D(64, (3, 3), activation='relu'))\n model.add(MaxPooling2D(pool_size=(2, 2)))\n model.add(Dropout(0.25))\n model.add(Conv2D(128, (3, 3), activation='relu'))\n model.\n", "model": "no-comments", "intent": "# Enable callback to be able to look at the progress on Tensorboard"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)\n\n\n\nDATE = '04202020'\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)", "original_comment": "# ## Read in information\n", "target_code": "PLdb = pd.read_csv(\n '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))\n", "project_metadata": {"full_name": "MrOlm/covid19_population_genomics", "description": "Analysis of the population diversity of SARS-CoV-2 within and between individual patients", "topics": [], "git_url": "git://github.com/MrOlm/covid19_population_genomics.git", "stars": 9, "watchers": 9, "forks": 1, "created": "2020-03-20T16:01:19Z", "size": 170583, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 
40959012, "Python": 1028}, "last_updated": "2020-12-05T12:24:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "data = pd.read_hdf(\n '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/hg38.h5', 'gencode')\n", "model": "docstring", "intent": "# Read in information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n\n# should be version 1.x\nprint(tf.__version__)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n#%%\n\n# should be version 1.x\nprint(tf.__version__)\n\n#%%", "original_comment": "# the following directive activates inline plotting\n", "target_code": "get_ipython().run_line_magic('matplotlib', 'inline')\n", "project_metadata": {"full_name": "NeilAlishev/HiCPredictor", "description": "Predict Hi-C maps from the DNA sequence using deep convolutional neural networks", "topics": [], "git_url": "git://github.com/NeilAlishev/HiCPredictor.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-01-12T17:39:25Z", "size": 25045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 9881823, "Python": 17479}, "last_updated": "2020-11-13T16:32:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, 
"precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2, "coverage": "Strongly disagree", "coverage-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "plt.style.use('ggplot')\n", "model": "natural", "intent": "# activate inline plotting"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. 
We first create a matrix filled with random numbers:\n\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n#%%\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n#%%\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n#%%\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. 
We first create a matrix filled with random numbers:\n\n#%%\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n#%%\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#", "original_comment": "# To compute the Frobenius norm:\n", "target_code": "A_frobenius = np.linalg.norm(A, 'fro')\n", "project_metadata": {"full_name": "garth-wells/notebooks-3M1", "description": "Jupyter notebooks (Python) for the course 3M1 at the Department of Engineering, University of Cambridge", "topics": ["linear-algebra", "singular-value-decomposition", "regression"], "git_url": "git://github.com/garth-wells/notebooks-3M1.git", "stars": 10, "watchers": 10, "forks": 18, "created": "2015-01-12T22:32:25Z", "size": 128315, "license": "bsd-2-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7472485}, "last_updated": "2021-01-04T10:34:46Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "A = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(np.linalg.norm(A, np.inf))\n", "model": "natural", "intent": "# To compute the Frobenius norm:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
\n# • header: include the first row as the header for the DataFrame object\n# • index_col: set the index column of the DataFrame to the first column of the data set ('month')\n# • parse_dates: automatically parse dates which will index the DataFrame\n# • sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon\n#
\n\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n\n# print the type of an object\ntype(df)\n\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n\n# list the index of the DataFrame\ndf.index\n\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n\n# print first 5 rows\ny.head()\n\n\n# print type of the ts object\ntype(y)\n\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n#%%\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
\n# • header: include the first row as the header for the DataFrame object\n# • index_col: set the index column of the DataFrame to the first column of the data set ('month')\n# • parse_dates: automatically parse dates which will index the DataFrame\n# • sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon\n#
\n\n#%%\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n#%%\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n#%%\n\n# print the type of an object\ntype(df)\n\n#%%\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n#%%\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n#%%\n\n# list the index of the DataFrame\ndf.index\n\n#%%\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n#%%\n\n# print first 5 rows\ny.head()\n\n#%%\n\n# print type of the ts object\ntype(y)\n\n#%%\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n#%%\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n#%%\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n#%%\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n#%%", "original_comment": "# ask Yahoo! Finance for data\n", "target_code": "corporate_actions = web.DataReader(tickers, 'yahoo-actions', start, end)\n", "project_metadata": {"full_name": "dacatay/time-series-analysis", "description": "Presentation for time series analysis", "topics": [], "git_url": "git://github.com/dacatay/time-series-analysis.git", "stars": 41, "watchers": 41, "forks": 53, "created": "2017-09-08T13:45:56Z", "size": 43990, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12370243, "R": 4829}, "last_updated": "2020-11-05T10:34:15Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "tickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\nprint(panel)\n", "model": "natural", "intent": "# ask Yahoo! 
Finance for data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n\nmodel.output_shape\n\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = 
None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n\n15000/2\n\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n#%%\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n#%%\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n#%%\n\ninputs = Input(shape=input_shape)\nnn_dense = 
Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n#%%\n\nmodel.output_shape\n\n#%%\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n#%%\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n#%%\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n 
verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n#%%\n\n15000/2\n\n#%%\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n#%%\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n#%%", "original_comment": "# Plot a single hologram with the particles overlaid\n", "target_code": "def plot_hologram(h, img, outputs=\"none\"):\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "train_preds_scaled_one.head()\n", "model": "no-comments", "intent": "# Plot a single hologram with the particles overlaid"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. 
Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. 
This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. 
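# As a quick numeric illustration of the margin formula derived above: for any separating hyperplane with normal $w$, the margin equals $2/||w||$. The vector below is a made-up toy value (it is not fitted to the Iris data) and is used only to make the arithmetic concrete:


# toy illustration of the margin formula (w_toy is an arbitrary example vector)
w_toy = np.array([3.0, 4.0])

# margin = 2 / ||w|| = 2 / 5 = 0.4 for this toy vector
print(2.0 / np.linalg.norm(w_toy))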
Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. 
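# For reference, when the dual $L_{D}$ is written out in full (as in the Burges tutorial referenced above), the quadratic term carries a negative sign and contains the pairwise dot products of the training samples, which is exactly where the data enters the objective:
#
# $$L_{D} = \sum_{i=1}^{l}\alpha_{i} - \frac{1}{2} \sum_{i,j=1}^{l} \alpha_{i}\alpha_{j}y_{i}y_{j} \, (x_{i} \cdot x_{j})$$
#
# It is maximized with respect to the $\alpha_{i}$, subject to $\alpha_{i} \geq 0$ and $\sum_{i=1}^{l}\alpha_{i}y_{i} = 0$.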
Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. 
The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? 
Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. 
A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. 
Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. 
However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. 
petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. 
Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. 
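# A short aside before the plots below: the convolutions in the next cells rely on scipy's default `mode='full'`, which returns the full convolution (28x30 for a 28x28 image and a 1x3 kernel). If gradient maps with the same 28x28 shape as the input are preferred, `mode='same'` together with a symmetric boundary can be used instead; this is shown here only as an optional variant:


# optional variant: keep the gradient map at the original 28x28 image size
from scipy.signal import convolve2d

g_x_same = convolve2d(mnist_eval_image, np.array([[-1, 0, 1]]), mode='same', boundary='symm')
print(g_x_same.shape)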
Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... 
+ 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n#%%\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n#%%\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n#%%\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n#%%\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n#%%\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n#%%\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. 
Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n#%%\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. 
The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n#%%\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. 
The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n#%%\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n#%%\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. 
Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n#%%\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. 
Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. 
polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n#%%\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. 
The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n#%%\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n#%%\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n#%%\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n#%%\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n#%%\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n#%%\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n#%%\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n#%%\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. 
Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. 
This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... + 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. 
Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n#%%\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n#%%\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n#%%\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n#%%\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n#%%\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n#%%\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "svm = SVC(kernel='linear', C=1, random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "mnist_train_data_hog_features = np.array(mnist_train_data_hog_features)\nmnist_train_data_hog_features.shape\n", "model": "no-comments", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, 
default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < 
len(mix):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n#%%\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n#%%\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n#%%\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n#%%\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = 
librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):", "original_comment": " # pad zero at the end\n", "target_code": " res_1 = np.concatenate([res_1, np.zeros(len(mix)-len(res_1))])\n", "project_metadata": {"full_name": "naplab/DANet", "description": "Deep Attractor Network (DANet) for single-channel speech separation", "topics": [], "git_url": "git://github.com/naplab/DANet.git", "stars": 53, "watchers": 53, "forks": 15, "created": "2018-09-18T21:26:22Z", "size": 11, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23340, "Python": 4814}, "last_updated": "2020-12-14T07:40:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.figure(figsize=(15, 5))\nplt.subplot(1, 3, 1)\nlibrosa.display.specshow(res_1, sr=sr, x_axis='time', y_axis='linear')\nplt.colorbar()\nplt.subplot(1, 3, 2)\nlibrosa.display.specshow(res_2, sr=sr, x_axis='time', y_axis='linear')\nplt.colorbar()\nplt.subplot(1, 3, 3)\nlibrosa.display.specshow(mix, sr=sr, x_axis='time', y_axis\n", "model": "no-comments", "intent": " # pad zero at the end"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as 
plt\nimport seaborn as sns\n\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')\n\n\n\ngender_amt = pd.DataFrame(fraud_df.head(100), columns=['amt', 'gender'])\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n#%%\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n#%%\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')", "original_comment": "# ### Boxplots of Amount by Gender\n", "target_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', x='gender', data=gender_amt, hue='gender',\n dodge=False, width=0.6, palette='Set2')\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Disagree", "coverage-score": 1, "usefulness": "Disagree", "usefulness-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.tail(100), width=0.4, color='mediumpurple')\ngender_amt = pd.DataFrame(fraud_df.tail(100), columns=['amt', 'gender'])\n", "model": "no-comments", "intent": "# Boxplots of Amount by Gender"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. 
[Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n\ntrainingSummary = model.summary\n", "original_context": "#!/usr/bin/env python\n# 
coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n#%%\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n#%%\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n#%%\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n#%%\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n#%%\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n#%%\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n#%%\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n#%%\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n#%%\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n#%%\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n#%%\n\n# Train\nmodel = 
lr.fit(train_data)\n\n\n# ## Model Summary\n\n#%%\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n#%%\n\ntrainingSummary = model.summary", "original_comment": "# train RMSE, MSE\n", "target_code": "print(\"RMSE: {}\".format(trainingSummary.rootMeanSquaredError))\nprint(\"MSE: {}\".format(trainingSummary.meanSquaredError))\nprint(\"R2: {}\".format(trainingSummary.r2))\n", "project_metadata": {"full_name": "dangkhoadl/my-BigData", "description": "A cache to store my Distributed System and Big Data resources", "topics": ["big-data", "coursera", "operating-systems", "distributed-systems", "cloud-computing"], "git_url": "git://github.com/dangkhoadl/my-BigData.git", "stars": 7, "watchers": 7, "forks": 8, "created": "2017-12-23T05:56:43Z", "size": 49086, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 888066, "C++": 48288, "Shell": 6317, "Python": 3334, "Makefile": 990}, "last_updated": "2020-01-21T03:30:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "trainingSummary.residuals.show()\n", "model": "no-comments", "intent": "# train RMSE, MSE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nplt.imshow(img1)\n\n\nplt.imshow(img2)\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\n# blending images of same size\n\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, 
gamma=0)\nplt.imshow(blended)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nlarge_img = img1\nsmall_img = img2\n\n\nx_offset = 0\ny_offset = 0\n\n\n# in numpy x axis is vertical and y axis is horizontal\n\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n\n# Blend images of different sizes\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nimg1.shape\n\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n\nimg2.shape\n\n\nrows, cols, channels = img2.shape\n\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n\nmask_inv.shape\n\n\n# you can see the image is 2D now\n\n\n\n\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n\nplt.imshow(white_bgd)\n\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n\nlarge_img = img1\nsmall_img = final_roi\n\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n\n\n\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n\nret\n\n\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\nshow_img(img)\n\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be 
applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n\n\n\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\ni = load_img()\nshow_img(i)\n\n\ngamma = 1/4\n\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n\nimg = load_img()\nshow_img(img)\n\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n\nimg = load_img()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and 
Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n#%%\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n#%%\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n#%%\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n#%%\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nplt.imshow(img1)\n\n#%%\n\nplt.imshow(img2)\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\n# blending images of same size\n\n#%%\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n#%%\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nlarge_img = img1\nsmall_img = img2\n\n#%%\n\nx_offset = 0\ny_offset = 0\n\n#%%\n\n# in numpy x axis is vertical and y axis is horizontal\n\n#%%\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n#%%\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n#%%\n\n# Blend images of different sizes\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nimg1.shape\n\n#%%\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n#%%\n\nimg2.shape\n\n#%%\n\nrows, cols, channels = img2.shape\n\n#%%\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n#%%\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n#%%\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n#%%\n\nmask_inv.shape\n\n#%%\n\n# you can see the image is 2D now\n\n#%%\n\n\n\n#%%\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n#%%\n\nplt.imshow(white_bgd)\n\n#%%\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n#%%\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n#%%\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n#%%\n\nlarge_img = img1\nsmall_img = final_roi\n\n#%%\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert 
an image to white or black\n\n#%%\n\n\n\n#%%\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n#%%\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n#%%\n\nret\n\n#%%\n\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\nshow_img(img)\n\n#%%\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n#%%\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n#%%\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n#%%\n\n\n\n#%%\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\ni = load_img()\nshow_img(i)\n\n#%%\n\ngamma = 1/4\n\n#%%\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n#%%\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n#%%\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n#%%\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n#%%\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, 
text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n#%%\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n#%%\n\nimg = load_img()\nshow_img(img)\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n#%%\n\nimg = load_img()", "original_comment": "# creating white noise\n", "target_code": "white_noise = np.random.randint(0, 2, size=(600, 600))\n", "project_metadata": {"full_name": "RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning", "description": null, "topics": [], "git_url": "git://github.com/RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning.git", "stars": 3, "watchers": 3, "forks": 5, "created": "2019-05-28T02:31:41Z", "size": 48363, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 98148466, "Python": 466}, "last_updated": "2020-12-21T09:24:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "\n", "model": "no-comments", "intent": "# creating white noise"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. 
All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. 
Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. 
Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n#%%\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n#%%\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n#%%\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n#%%", "original_comment": "# get list of r-band images\n", "target_code": "imgList_r = list(glob('Inputs/photo_r_CNN*.fits'))\n", "project_metadata": {"full_name": "cbottrell/RealSim", "description": "RealSim is the statistical observational realism suite described in Bottrell et al 2017ab and made public in Bottrell et al 2019b.", "topics": [], "git_url": "git://github.com/cbottrell/RealSim.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2019-07-10T21:26:45Z", "size": 20047, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2416365, "C": 294600, "Python": 34394, "Makefile": 4159, "Tcl": 1042, "Shell": 374, "C++": 88}, "last_updated": "2020-05-29T13:33:55Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "ifu_base_path = 'Inputs/Datacubes/'\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\nredshift = 0.046\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\nbands = ['u', 'g', 'r', 'i']\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, '\n", "model": "no-comments", "intent": "# get list of r-band images"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total 
failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. 
Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n#%%\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n#%%\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n#%%\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n#%%\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n#%%", "original_comment": "# Check if there is any null information anywhere\n", "target_code": "train.isnull().any().any()\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train.isnull().sum()\ntest.isnull().sum()\n", "model": "docstring", "intent": "# Check if there is any null information anywhere"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. 
The data is loaded below:\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n\ninput_shape\n\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. 
Use 50 epochs, 25 training steps and 15 validation steps\n\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n#%%\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n#%%\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). 
Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n#%%\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n#%%\n\ninput_shape\n\n#%%\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n#%%\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n#%%\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. Use 50 epochs, 25 training steps and 15 validation steps\n\n#%%\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. 
Keep everything else the same.\n\n#%%\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n#%%\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n#%%\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))", "original_comment": "# Second Convolutional layer.\n", "target_code": "new_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "new_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "model": "natural", "intent": "# Second Convolutional layer."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account 
credentials\n\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n#%%\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n#%%\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n#%%\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n#%%\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n#%%\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)", "original_comment": "# print git status for the local repository\n", "target_code": "status_command = \"git status\"\noutput = subprocess.check_output(status_command, shell=True)\nprint(output)\n", "project_metadata": {"full_name": "uzh/helmchen-spark", "description": "Playbooks and other files to build a (virtual) Spark cluster for Prof. 
Helmchen's research group", "topics": [], "git_url": "git://github.com/uzh/helmchen-spark.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2016-03-23T21:54:52Z", "size": 6519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2819538, "Python": 37375, "Shell": 3482}, "last_updated": "2019-12-15T16:09:17Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "get_ipython().system('git status')\n", "model": "natural", "intent": "# print git status for the local repository"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. 
Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. 
The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, 
\"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\ndrop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . 
Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n#%%\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n#%%\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n 
dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\n#%%", "original_comment": "# Feature selection: remove variables no longer containing relevant information\n", "target_code": "train = train.drop(drop_elements, axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train = train.drop(drop_elements, axis=1)\ntrain = train.drop(['CategoricalAge', 'CategoricalFare'], axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "model": "no-comments", "intent": "# Feature selection: remove variables no longer containing relevant information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n\ndf_1 = pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n\nfraud_df.shape\n# Craig I 
noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n\n# Checking for data types\n# fraud_df.dtypes\n\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n\n#fraud_df = fraud_df.head(10000)\n\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n\nfraud_df.head()\n\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### Encoding\n\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - 
if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n\nfraud_df[cat_col_onehotencode].head()\n\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n\nfraud_df.head()\n\n\n# fraud_df.columns\n\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n\n# change data sets into maxtrix objects for the 
models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n#%%\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n#%%\n\ndf_1 = 
pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n#%%\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n#%%\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n#%%\n\n# Checking for data types\n# fraud_df.dtypes\n\n#%%\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n#%%\n\n#fraud_df = fraud_df.head(10000)\n\n#%%\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n#%%\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n#%%\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n#%%\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### 
Encoding\n\n#%%\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n#%%\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n#%%\n\nfraud_df[cat_col_onehotencode].head()\n\n#%%\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n#%%\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# fraud_df.columns\n\n#%%\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n#%%\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n#%%\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n#%%\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n#%%\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n#%%\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n#%%\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n#%%\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n#%%\n\n# change data 
sets into maxtrix objects for the models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n#%%\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n#%%\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)", "original_comment": "# Summarize the new class distribution\n", "target_code": "counter = Counter(y_train)\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly agree", "precision-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly 
agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "counter = Counter(y_train)\nprint(counter)\n", "model": "natural", "intent": "# Summarize the new class distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. 
Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n\ntype(my_decimal)\n\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n#%%\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. 
Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n#%%\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n#%%\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n#%%\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n#%%\n\ntype(my_decimal)\n\n#%%\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n#%%\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. 
You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n#%%\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n#%%\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#", "original_comment": "# Lists can also be added to:\n", "target_code": "my_number_list.append(6)\n", "project_metadata": {"full_name": "wrightaprilm/CompBio2018", "description": null, "topics": [], "git_url": "git://github.com/wrightaprilm/CompBio2018.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2018-08-14T16:08:48Z", "size": 8976, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4697133, "TeX": 4884, "Python": 4399}, "last_updated": "2019-06-27T20:53:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "my_text = \"This is a string variable\"\nmy_text\nmy_number = 64\nmy_decimal = 1.64\ntype(my_decimal)\n", "model": "no-comments", "intent": "# add element to list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image 
classification algorithm in our region\n\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n\nic.fit(inputs=s3_channels)\n\n\n\nendpoint_name = 'c5-'+time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n#%%\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n#%%\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n#%%\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n#%%\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n#%%\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n#%%\n\nic.fit(inputs=s3_channels)", "original_comment": "# ### Deploy the model\n", "target_code": "c5_predictor = ic.deploy(initial_instance_count=1,\n instance_type='ml.c5.large',\n 
endpoint_name=endpoint_name,\n wait=False)\n", "project_metadata": {"full_name": "PacktPublishing/Learn-Amazon-SageMaker", "description": "Learn Amazon SageMaker", "topics": [], "git_url": "git://github.com/PacktPublishing/Learn-Amazon-SageMaker.git", "stars": 30, "watchers": 30, "forks": 20, "created": "2020-04-22T14:55:25Z", "size": 47447, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2818256, "Python": 146100, "R": 2078, "Dockerfile": 738}, "last_updated": "2020-12-29T08:53:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "predictor = ic.deploy(initial_instance_count=1,\n instance_type='ml.m4.xlarge')\n", "model": "docstring", "intent": "# Deploy the model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. 
Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n\nfrom numpy.random import randint\nimport numpy as np\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. 
Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. 
How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n#%%\n\nfrom numpy.random import randint\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n#%%\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n#%%\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n#%%\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n#%%", "original_comment": "# plot the distribution of sample means\n", "target_code": "from matplotlib import pyplot as plt\n\nplt.hist(means_100)\nplt.show()\n", "project_metadata": {"full_name": "summerela/python_data_analysis", "description": "Introduction to Data Analysis with Python for UW Foster School of Business", "topics": [], "git_url": "git://github.com/summerela/python_data_analysis.git", "stars": 11, "watchers": 11, "forks": 27, "created": "2019-06-08T02:35:32Z", "size": 7972, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7883836}, "last_updated": "2020-11-09T16:54:13Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "compatibility": "Disagree", "compatibility-score": 1, "precision": "Strongly agree", "precision-score": 3, "coverage": "Agree", "coverage-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "fig, ax = plt.subplots()\nax.hist(means_100)\n", "model": "natural", "intent": "# plot the distribution of sample means"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport 
math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n\ndata.dtypes\n\n\ndata.size\n\n\ndata.info()\n\n\ndata['RAM'].describe()\n\n\ndata.describe()\n\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n\ndf.info()\n\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n\nlen(df)\n\n\ndf.fillna(np.nan)\ndf\n\n\ndf_dropped = df.dropna()\ndf_dropped\n\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n\ndf.head(2)\n\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n\n# checking unique values\ndfm['GPRS'].unique()\n\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n\n# checking data types\ndfm.dtypes\n\n\n# count of every column\ndfm.count()\n\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n\ndf1 = df_dropped\nlen(df1)\n\n\n# checking the datatypes\ndf1.dtypes\n\n\n# displaying info\ndf1.info()\n\n\ndf1.head()\n\n\ndf1.tail()\n\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n#%%\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as 
np\n\n\n# ## Load the data\n\n#%%\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n#%%\n\ndata.dtypes\n\n#%%\n\ndata.size\n\n#%%\n\ndata.info()\n\n#%%\n\ndata['RAM'].describe()\n\n#%%\n\ndata.describe()\n\n#%%\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n#%%\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n#%%\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n#%%\n\nlen(df)\n\n#%%\n\ndf.fillna(np.nan)\ndf\n\n#%%\n\ndf_dropped = df.dropna()\ndf_dropped\n\n#%%\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n#%%\n\ndf.head(2)\n\n#%%\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n#%%\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking unique values\ndfm['GPRS'].unique()\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n#%%\n\n# count of every column\ndfm.count()\n\n#%%\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n#%%\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n#%%\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n#%%\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n#%%\n\ndf1 = df_dropped\nlen(df1)\n\n#%%\n\n# checking the datatypes\ndf1.dtypes\n\n#%%\n\n# displaying info\ndf1.info()\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf1.tail()\n\n#%%\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n#%%\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n#%%", "original_comment": "# changing the datatype\n", "target_code": "df1[['display_size']] = df1[['display_size']].apply(pd.to_numeric)\n", "project_metadata": {"full_name": "yatinagg/Mobile_Price_Classification", "description": "Dataritz Phone Price Classification", "topics": [], "git_url": "git://github.com/yatinagg/Mobile_Price_Classification.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-05-18T15:04:18Z", "size": 6525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4727409, "Python": 568}, "last_updated": "2020-08-30T08:37:02Z"}, 
"annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly agree", "precision-score": 3}], "predicted_code": "df1['display_size'] = dfm['display_size'].apply(pd.to_numeric)\ndf1\n", "model": "docstring", "intent": "# changing the datatype"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n#%%\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n#%%\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n#%%\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n#%%\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n#%%\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n#%%\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n#%%\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n#%%\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)", "original_comment": "# ### The following code cells shape the new arrays.\n", "target_code": "np.shape(d_norm)\n", "project_metadata": {"full_name": "abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation", "description": "initial commit", "topics": [], "git_url": 
"git://github.com/abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-11-06T17:59:31Z", "size": 13456, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5835338, "Python": 6376}, "last_updated": "2020-11-12T20:56:51Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "d_norm = np.shape(d_norm)\nli_norm = np.shape(li_norm)\n", "model": "docstring", "intent": "# shape array"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n\ntrain_df.head()\n\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, 
Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n#%%\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n#%%\n\ntrain_df.head()\n\n#%%\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n#%%\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)", "original_comment": "# To cast data into float32 type\n", "target_code": "train_data = train_data.astype('float32')\n", "project_metadata": {"full_name": "aditya2000/MNIST-Fashion-", "description": null, "topics": [], "git_url": "git://github.com/aditya2000/MNIST-Fashion-.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2019-07-10T10:06:01Z", "size": 40, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 76938}, "last_updated": "2020-09-28T23:05:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "train_data = train_data/255\ntest_data = test_data/255\n", "model": "docstring", "intent": "# To cast data into float32 type"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Guttenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Guttenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n\ntype(raw_text)\n\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n\nraw_text[:100]\n\n\n# ## Tokenization\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Guttenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Guttenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n#%%\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom nltk import word_tokenize\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n#%%\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n#%%\n\ntype(raw_text)\n\n#%%\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n#%%\n\nraw_text[:100]\n\n\n# ## Tokenization", "original_comment": "# Turning the text into words using the nltk word_tokenizer\n", "target_code": "from nltk import word_tokenize\n\nwords_text = word_tokenize(raw_text)\n", "project_metadata": {"full_name": "derekjjackson/DH_PythonLibraries_JupyterNotebooks", "description": "FIles and resources for using Data Science, Python, and Jupyter Notebooks in the practice of Digital Humanities", "topics": [], "git_url": "git://github.com/derekjjackson/DH_PythonLibraries_JupyterNotebooks.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2018-10-20T15:06:33Z", "size": 29200, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10076299}, "last_updated": "2020-12-25T21:05:12Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "raw_text = raw_text.replace('\\n', '')\nraw_text = raw_text.replace('\\r', '')\nraw_text = raw_text.replace('\\t', '')\nraw_text = raw_text.replace('\\f', '')\nraw_text = raw_text.replace('\\xa0', '')\nraw_text\n", "model": "no-comments", "intent": "# Turning the text into words using the nltk word_tokenizer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala, which spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.\n#
  2. pull_files.csv contains the files that were modified by each pull request.\n#
\n\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.

\n\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.

\n\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.

\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier to entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.
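As a small companion sketch to the cell below, grouping with size() yields a single count per contributor, and describe() summarises how skewed that distribution is before it is histogrammed.

# Assumes pulls has been loaded as above
prs_per_user = pulls.groupby('user').size()

# Summary statistics give a quick sense of the skew
print(prs_per_user.describe())
prs_per_user.plot(kind='hist', bins=50)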

\n\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead, and contributing there might not have much impact. Therefore, it is important to understand which parts of the system have been changed recently. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening; focusing on those parts is likely to be the most effective use of our time.
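In addition to the set of unique files computed below, a quick value_counts() on the same join shows which files were touched by more than one of the ten most recent pull requests, a rough proxy for where the activity is concentrated.

# Sketch, assuming pulls and pull_files as loaded earlier
last_10 = pulls.nlargest(10, 'date')
recent_files = last_10.merge(pull_files, on='pid')['file']

# How often each file appears among the ten most recent pull requests
print(recent_files.value_counts().head(10))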

\n\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance or some information regarding the codebase. Contributors to open source projects generally have other day jobs, so their time is limited, and it is important to direct our questions to the right people. One way to identify the right target for our inquiries is to look at their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.
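An equivalent, slightly more compact way to obtain the same top-three list (using the merged frame from task 3) is a value_counts() on the 'user' column; the cell below reaches the same answer via groupby.

# Sketch, assuming the merged DataFrame `data` from task 3
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
top_authors = data.loc[data['file'] == file, 'user'].value_counts().head(3)
print(top_authors)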

\n\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.
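One way this step might be completed is sketched below: filter pull_files on the file itself, merge the dates and authors back in from pulls, and keep the ten most recent pull requests. The cell that follows begins the same selection.

# Hedged sketch, assuming pulls and pull_files as loaded earlier
file = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'
file_pr = pull_files[pull_files['file'] == file]

# Attach dates and authors, then keep the ten most recent pull requests
joined_pr = file_pr.merge(pulls, on='pid')
users_last_10 = set(joined_pr.nlargest(10, 'date')['user'])
print(users_last_10)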

\n\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala, which includes data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.
\n#
  2. pull_files.csv contains the files that were modified by each pull request.
\n#
\n\n#%%\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.

\n\n#%%\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.

\n\n#%%\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.

\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier to entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.

\n\n#%%\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead, and contributing there might not have much impact. Therefore, it is important to understand which parts of the system have been changed recently. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening; focusing on those parts is likely to be the most effective use of our time.

\n\n#%%\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance or some information regarding the codebase. Contributors to open source projects generally have other day jobs, so their time is limited, and it is important to direct our questions to the right people. One way to identify the right target for our inquiries is to look at their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n#%%\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.

\n\n#%%\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]", "original_comment": "# Merge the obtained results with the pulls DataFrame\n", "target_code": "joined_pr = file_pr.merge(pulls, on='pid')\n", "project_metadata": {"full_name": "ChristianNogueira/datacamp_projects", "description": "DataCamp Projects", "topics": ["datacamp"], "git_url": "git://github.com/ChristianNogueira/datacamp_projects.git", "stars": 17, "watchers": 17, "forks": 13, "created": "2018-01-17T16:58:27Z", "size": 8441, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12129948}, "last_updated": "2020-08-21T20:03:31Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "file ='src/compiler/scala/reflect/reify/phases/Calculate.scala'\nfile_pr = data[data['file'] == file]\njoined_pr = file_pr.merge(pulls, on='pid')\nusers_last_10 = set(joined_pr.nlargest(10, 'date_x')['user'])\nusers_last_10\n", "model": "no-comments", "intent": "# Merge the obtained results with the pulls DataFrame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata.isnull().sum()\n\n\ndata.dropna(inplace=True)\n\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as 
np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n#%%\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata.isnull().sum()\n\n#%%\n\ndata.dropna(inplace=True)\n\n#%%\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n#%%\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.explained_variance_\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "explained_variance = pca.explained_variance_ratio_\nexplained_variance\n", "model": "docstring", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. 
The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. 
Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n#%%\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n#%%", "original_comment": "# Let's separate into train and test set\n", "target_code": "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n data[['age', 'fare']],\n data['survived'],\n test_size=0.3,\n random_state=0)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train, test = train_test_split(data, test_size=0.2)\n", "model": "docstring", "intent": "# Let's separate into train and test set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n\n# comments beginning with #BEE were written by bee martin\n\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n\nastrometric_error = [0.035, 0.025] # [u-band error, g-band error]\n\nairmasses = uniform(low=1.0, 
high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n\n# GTR: Extract the airmass and filters for each observation\n\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = 
m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n#%%\n\n# comments beginning with #BEE were written by bee martin\n\n#%%\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n#%%\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n#%%\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n#%%\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n#%%\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n#%%\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n#%%\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n#%%\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n#%%\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n#%%\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n#%%\n\nastrometric_error = [0.035, 0.025] # [u-band 
error, g-band error]\n\nairmasses = uniform(low=1.0, high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n#%%\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n#%%\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n#%%\n\n# GTR: Extract the airmass and filters for each observation\n\n#%%\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n#%%\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n#%%\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n#%%\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n#%%\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n#%%\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n#%%\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n#%%\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n#%%\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n#%%\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n#%%\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n#%%\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n#%%\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n#%%\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n#%%\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n 
bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n#%%\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n#%%\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n#%%\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n#%%\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n#%%\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]", "original_comment": " # empty arrays to be filled\n", "target_code": " feature_distance = np.zeros((num_of_quasars, num_features, len(zshifts)))\n", "project_metadata": {"full_name": "RichardsGroup/LSSTprep", "description": "Repository for Richards group LSST prep work, specifically related to the AGN SC", "topics": [], "git_url": "git://github.com/RichardsGroup/LSSTprep.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2018-06-20T20:43:08Z", "size": 30265, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 8424521, "Python": 6419}, "last_updated": "2020-09-28T18:32:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "zshifts = np.full((num_of_quasars,), np.nan)\nfeature_zshift_fit = np.full((num_of_quasars,), np.nan)\n# Return new array of given shape and type, filled with 'fill_value'\nfeature_covariance = np.full((num_of_quasars,), np.nan)\n", "model": "docstring", "intent": " # empty arrays to be filled"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. 
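#
# As a rough illustration of the three steps just listed, here is a minimal NumPy sketch of a
# k-means pass on a toy 1-D feature (the array `values` and the cluster count `k` are made-up
# placeholders; the actual demo below uses scikit-learn rather than this hand-rolled loop):

import numpy as np

values = np.random.rand(200)                                # toy 1-D feature (hypothetical)
k = 3
centers = np.random.choice(values, size=k, replace=False)   # 1) random initial centers
for _ in range(10):                                         # repeat steps 2) and 3)
    # 2) associate every point with its closest center
    labels = np.argmin(np.abs(values[:, None] - centers[None, :]), axis=1)
    # 3) re-compute each center as the mean of its associated points
    centers = np.array([values[labels == j].mean() for j in range(k)])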
The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. Potentially it would make a difference if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. 
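#
# For instance, a quick way to see how much k-means binning differs from equal width binning on a
# given variable is to compare the two strategies of scikit-learn's KBinsDiscretizer directly
# (the toy array below is a made-up placeholder, not the Titanic data):

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

toy = np.random.rand(500, 1) * 100                                         # hypothetical continuous variable
kmeans_disc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
width_disc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
print(kmeans_disc.fit(toy).bin_edges_[0])                                  # bin edges chosen by k-means
print(width_disc.fit(toy).bin_edges_[0])                                   # equal-width bin edges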
Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n#%%\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n#%%", "original_comment": "# Let's separate into train and test set\n", "target_code": "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n data[['age', 'fare']],\n data['survived'],\n test_size=0.3,\n random_state=0)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "X = data[['age', 'fare','survived']]\nX.head()\n", "model": "no-comments", "intent": "# Let's separate into train and test set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
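#
# The cells above set up the data paths, the architecture and the crop size; as a rough sketch of
# the training step this script is building toward (using the old fastai 0.7 API imported here --
# treat the learning rate and epoch count as illustrative placeholders, not the author's exact values):

data = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch, sz))
learn = ConvLearner.pretrained(arch, data, precompute=True)
learn.fit(1e-2, 3)   # placeholder learning rate and number of epochs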
During training I used Google Cloud Platform and a K80 GPU.\n\n#%%\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n#%%\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n#%%\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n#%%\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n#%%\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n#%%\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n#%%\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n#%%\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n#%%\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224", "original_comment": "# Loading in the data.\n", "target_code": "data = ImageClassifierData.from_paths(\n PATH, tfms=tfms_from_model(arch, sz), bs=64)\n", "project_metadata": {"full_name": "zachmonge/fish_computer_vision_example", "description": "This is the repository corresponding to my Medium blog post titled \"Does Deep Learning Really Require 'Big Data'? 
--No!\"", "topics": [], "git_url": "git://github.com/zachmonge/fish_computer_vision_example.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-08-20T03:51:12Z", "size": 8148, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1391423}, "last_updated": "2020-02-13T19:27:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "arch = resnet34\ndata = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch, sz))\nlearn = ConvLearner.pretrained(arch, data, precompute=True)\n", "model": "natural", "intent": "# Loading in the data."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. 
Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. 
\"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. 
Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. 
the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. 
Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. 
The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? 
Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. 
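#
# In practice, a reasonable way to pick $C$ is by cross-validation rather than by hand. A minimal
# sketch with scikit-learn's GridSearchCV on the training split defined earlier (the candidate
# grid below is an arbitrary illustrative choice):

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}                 # hypothetical candidate values
grid = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)                                    # C value with the best cross-validated accuracy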
A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. 
Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. 
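#
# To make the kernel function above concrete, here is a small NumPy sketch that evaluates the
# Gaussian (RBF) kernel $K(x_{i}, x_{j}) = \exp(-||x_{i}-x_{j}||^{2} / 2\sigma^{2})$ on a few toy
# 2-D points (the points and $\sigma$ are arbitrary placeholders):

import numpy as np

X_toy = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])      # three toy points (hypothetical)
sigma = 1.0

# pairwise squared Euclidean distances between all points
sq_dists = np.sum((X_toy[:, None, :] - X_toy[None, :, :])**2, axis=-1)

# Gram matrix of the RBF kernel: entry (i, j) equals K(x_i, x_j)
K_toy = np.exp(-sq_dists / (2.0 * sigma**2))
print(K_toy)                                                 # symmetric, with ones on the diagonal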
However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. 
petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. 
Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. 
Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... 
+ 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
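# Before moving on, here is a small illustrative sketch of these geometric quantities. Pls. note that the weight vector $w$ and the offset $b$ below are toy values chosen purely for the example (they are not learned from the Iris data); the sketch simply evaluates the decision value $w \cdot x + b$ and the signed perpendicular distance $\frac{w \cdot x + b}{||w||}$ of two toy points to the hyperplane.

#%%

# toy hyperplane parameters (illustrative values only, not learned from the data)
w = np.array([1.0, 1.0])
b = -3.0

# two toy observations, one expected on each side of the hyperplane
x_a = np.array([3.0, 2.5])
x_b = np.array([0.5, 1.0])

for x in (x_a, x_b):

    # signed decision value w * x + b
    score = np.dot(w, x) + b

    # signed perpendicular distance of x to the hyperplane
    distance = score / np.linalg.norm(w)

    # print the decision value, the distance and the resulting class label
    print('score: {:+.2f}, distance: {:+.2f}, label: {:+.0f}'.format(
        score, distance, np.sign(score)))
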
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n#%%\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n#%%\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n#%%\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n#%%\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n#%%\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n#%%\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. 
Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n#%%\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. 
The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n#%%\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. 
The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n#%%\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n#%%\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. 
Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n#%%\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. 
Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. 
polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n#%%\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. 
The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n#%%\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n#%%\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n#%%\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n#%%\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n#%%\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n#%%\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n#%%\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n#%%\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. 
Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. 
This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... + 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. 
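# To make the binning step concrete, below is a minimal sketch of the per-patch histogram computation outlined above. Pls. note that this is an illustration of the idea rather than the exact implementation used by `skimage.feature.hog`: it reuses the kernels `kernel_x`, `kernel_y` and the sample image `mnist_eval_image` defined earlier, recomputes the gradient maps with `mode='same'` so that they align with the 28x28 image, and splits each magnitude vote linearly between the two neighbouring orientation bins before applying the L2-normalisation described above.

#%%

# recompute the gradient maps so that they align with the 28x28 image
g_x_same = sp.signal.convolve2d(mnist_eval_image, kernel_x, mode='same')
g_y_same = sp.signal.convolve2d(mnist_eval_image, kernel_y, mode='same')

# gradient magnitude and unsigned gradient orientation (in degrees, range [0, 180))
magnitude = np.sqrt(g_x_same ** 2 + g_y_same ** 2)
orientation = np.rad2deg(np.arctan2(g_y_same, g_x_same)) % 180.0

# select a single 7x7 image patch (here: the top-left patch)
patch_mag = magnitude[0:7, 0:7].ravel()
patch_ori = orientation[0:7, 0:7].ravel()

# accumulate the magnitude votes into 9 orientation bins of 20 degrees each,
# splitting every vote linearly between the two neighbouring bins
n_bins, bin_width = 9, 20.0
patch_hist = np.zeros(n_bins)
for m, o in zip(patch_mag, patch_ori):
    lower_bin = int(o // bin_width) % n_bins
    upper_bin = (lower_bin + 1) % n_bins
    frac = (o - lower_bin * bin_width) / bin_width
    patch_hist[lower_bin] += m * (1.0 - frac)
    patch_hist[upper_bin] += m * frac

# L2-normalise the patch histogram to obtain the HOG feature vector of the patch
patch_hist_normalized = patch_hist / (np.linalg.norm(patch_hist) + 1e-12)
print(patch_hist_normalized)
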
Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n#%%\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n#%%\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n#%%\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n#%%\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n#%%\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n#%%\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "svm = SVC(kernel='linear', C=1, random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "svc = svm.LinearSVC()\nsvc.fit(train_features, train_labels)\n", "model": "docstring", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! 
>\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. 
using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n#%%\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n#%%\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n#%%\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n#%%\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! >\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n#%%\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n#%%\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n#%%\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n#%%\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a 
keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n#%%\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n#%%\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)", "original_comment": "# ### Put them all together in multiplots\n", "target_code": "pzb.plot_multi(names=['fof', 'zz', 'pdf', 'pit'], verbose=1, save_plot=1)\n", "project_metadata": {"full_name": "LSSTDESC/pz_blend", "description": "impact of blending on photo-zs using DC2 truth catalogs and image catalogs", "topics": [], "git_url": "git://github.com/LSSTDESC/pz_blend.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-03-12T22:06:14Z", "size": 2183, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1709826, "Python": 89195}, "last_updated": "2020-12-09T18:50:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "pzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\npzb.fof_results\n", "model": "docstring", "intent": "# Put them all together in multiplots"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom 
IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n#%%\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import optimizers\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n#%%\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n#%%\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n#%%\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n#%%\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n#%%\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n#%%\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))", "original_comment": "# compling and show model\n", "target_code": "from keras import optimizers\n\nmodel.compile(loss=\"categorical_crossentropy\", optimizer=optimizers.SGD(\n lr=0.0001, momentum=0.9), metrics=[\"accuracy\"])\nmodel.summary()\n", "project_metadata": {"full_name": "WuZhuoran/Plant_Seedlings_Classification", "description": "Kaggle Competition Project as well as ANLY 590 Final Project. 
Task: Determine the species of a seedling from an image", "topics": [], "git_url": "git://github.com/WuZhuoran/Plant_Seedlings_Classification.git", "stars": 10, "watchers": 10, "forks": 7, "created": "2018-10-31T01:19:27Z", "size": 10167, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2140227, "Python": 31477}, "last_updated": "2020-12-18T16:42:52Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model.compile(loss='categorical_crossentropy',\n optimizer='adam', metrics=['accuracy'])\n", "model": "no-comments", "intent": "# compling and show model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC 
= plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. 
using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier 
is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with scikit-learn:\n\n\nX, y = 
load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. 
In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n#%%\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n#%%\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n#%%\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n#%%\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n#%%\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n#%%\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n#%%\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n#%%\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n#%%\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n#%%\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n#%%\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with 
scikit-learn:\n\n#%%\n\nX, y = load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')", "original_comment": "# making predictions on the testing set\n", "target_code": "y_pred = lr.predict(X_test)\n", "project_metadata": {"full_name": "asabenhur/CS345", "description": "Jupyter", "topics": [], "git_url": "git://github.com/asabenhur/CS345.git", "stars": 4, "watchers": 4, "forks": 11, "created": "2020-08-11T19:32:02Z", "size": 6413, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4808835}, "last_updated": "2020-12-30T20:50:00Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "get_ipython().run_line_magic('timeit',\n 'lr.fit(StandardScaler().fit_transform(X_train), y_train)')\n", "model": "no-comments", "intent": "# making predictions on the testing set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n\n# comments beginning with #BEE were written by bee martin\n\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n\nastrometric_error = [0.035, 0.025] # [u-band error, g-band error]\n\nairmasses = uniform(low=1.0, 
high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n\n# GTR: Extract the airmass and filters for each observation\n\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n bestFitLine_g = 
m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook takes simulated quasar colors and DCR slopes. Takes a single pointing from a single opSim and pretends that all quasars were observed at that point. Then it simulates an \"observation\" in both the u and g band at every time of observation, including astrometric error. Then it fits a line between the \"observed\" DCR offset and tan Z. This slope is a function of redshift. The slope determined from either 3 observations, 20 observations, or all the observations is compared to the \"known\" slope.\n#\n# There is a lot of code that isn't necessary to do the above that can get cleaned up. It would also be good to assign each quasar to a different position on the sky. Also to enable this to sample many different opSims.\n#\n# A potential metric could be found by taking 1-2 redshifts where the slope is close to 0 (DCR is not important) -- maybe where the u and g slopes are predicted to have opposite signs. Then calculate the chi-square (or similar) for the slope determined from all of the observations for all the objects. 
It should correlate highly with the number of u- and g-band observations and the airmass of those observations, which may perhaps lead to a simpler metric that doesn't actually involve DCR at all (at least directly).\n\n#%%\n\n# comments beginning with #BEE were written by bee martin\n\n#%%\n\nimport pandas as pd\nfrom matplotlib import rc\nimport random\nimport math\nimport astropy\nfrom astropy.io import ascii\nimport numpy as np\nimport emcee\nfrom scipy.optimize import minimize\nfrom numpy.random import normal\nfrom numpy.random import uniform\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport palettable\nimport richardsplot as rplot\nget_ipython().run_line_magic('matplotlib', 'inline')\nrc('text', usetex=False)\n\n\n# ## open file with photo-z PDF redshift bins\n\n#%%\n\n# BEE: read in table of redshifts and save the 'zshifts' column as a variable named zshifts\n# BEE: zshifts is a list of redshifts from 0.4 to 4.0\n# GTR: This is just a list of redshift bins\n\n#%%\n\nzshifts_Table = ascii.read('fittingS82_zshifts.dat', format='csv')\nzshifts = zshifts_Table['zshifts']\n\n\n# ## open file with regression values\n\n#%%\n\n# BEE: create an array of sdss features\n# BEE: read in table of regression values, create array of zeros with shape(features, redshifts)\n# BEE: fill array of zeros with data from regression values table\n# GTR: These are the mean colors and DCR slopes for the above redshift bins\n\n#%%\n\nsdss_features = ['u-g', 'g-r', 'r-i', 'i-z']\nsdss_features_dcr = ['u-g', 'g-r', 'r-i', 'i-z', 'u-slope', 'g-slope']\n\ncolor_fit_Table = ascii.read('fittingS82_zshiftfit.dat')\ncolor_fit_Table.remove_column('col1')\ncolor_fit = np.zeros((len(sdss_features), len(zshifts)))\ncolor_fit_dcr = np.zeros((len(sdss_features_dcr), len(zshifts)))\nfor i in range(len(sdss_features)):\n for j in range(len(zshifts)):\n color_fit[i, j] = np.asarray(color_fit_Table[i][j])\n\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(zshifts)):\n color_fit_dcr[i, j] = np.asarray(color_fit_Table[i][j])\n\n\n# ## open file with regression covariance values\n\n#%%\n\n# BEE: read in regression covariance data\n# BEE: create array of zeros with shape (features, features, redshifts), fill it with covariance table data\n# GTR: These are the covariances between each of the above parameters at each redshift\n\n#%%\n\ncolor_covariance_Table = ascii.read('fittingS82_zshiftcovariance.dat')\ncolor_covariance_Table.remove_column('col1')\ncolor_covariance_Table.remove_column('col2')\ncolor_covariance = np.zeros(\n (len(sdss_features), len(sdss_features), len(zshifts)))\ncolor_covariance_dcr = np.zeros(\n (len(sdss_features_dcr), len(sdss_features_dcr), len(zshifts)))\nl = 0\nfor i in range(len(sdss_features_dcr)):\n for j in range(len(sdss_features_dcr)):\n for k in range(len(zshifts)):\n color_covariance_dcr[i, j, k] = np.asarray(\n color_covariance_Table[l][k])\n l += 1\ncolor_covariance = color_covariance_dcr[:4, :4, :]\n# print(color_covariance_dcr)\n# print(color_covariance)\n\n\n# ## open file with the simulated quasar true values\n\n#%%\n\n# BEE: Read in simulated \"true\" quasar data\n# GTR: These are simulated quasars with simulated parameters (and their errors)\n\n#%%\n\ntest_quasars0 = ascii.read('random_quasars.dat')\ntest_quasars = ascii.read('random_quasars100k.dat')[:1000]\nprint(test_quasars.keys())\n\n\n# ## define the observations\n\n#%%\n\n# BEE: simulate airmass observations in u ang g\n# GTR: We ignore the next cell?\n\n#%%\n\nastrometric_error = [0.035, 0.025] # [u-band 
error, g-band error]\n\nairmasses = uniform(low=1.0, high=1.3, size=50)\nairmasses = np.append(airmasses, uniform(low=1.3, high=2.0, size=14))\n\nfilters = np.tile(['u', 'g'], int(len(airmasses)/2))\n\n#%%\n\n# BEE: this cell will take observations from the OpSim rather than simulating them\n# GTR: Not sure exactly where this opSim information comes from. Weixiang?\n# id.csv is just an indexed list of RA and Dec\n# dcr_all.csv is a list of observation parameters for each of those IDs\n# this includes airmass and filter, which is all that we use right now?\n# It seems that right now a random object is being chosen?\n\n#%%\n\nastrometric_error = [0.035, 0.025]\n#astrometric_error = np.multiply(astrometric_error, [2,2])\nprint(astrometric_error)\n# Weixiang: import opsim cadence after fix for python2\nids = pd.read_csv('id.csv')\ncad = pd.read_csv('dcr_all.csv')\n\n# pick random object's cadence\nrandom_cadence = random.randint(0, max(cad['id']))\n# assign the cadence of random object to dcr_0\ndcr_0 = cad[cad['id'] == random_cadence].copy()\nobs_g = dcr_0[dcr_0['filter'] == 'g']\nobs_u = dcr_0[dcr_0['filter'] == 'u']\nobs = np.concatenate((obs_g, obs_u))\n\n# Orginal code to import cadence\n# dcr = np.load('dcr.npz')\n# print(list(dcr.keys()))\n# dcrra_dec = dcr['ra_dec']\n# dcrdata = dcr['data']\n# print(dcrra_dec[0])\n# obs_g = dcrdata[0][dcrdata[0]['filter']=='g']\n# obs_u = dcrdata[0][dcrdata[0]['filter']=='u']\n# obs = np.concatenate((obs_g, obs_u))\n\n\n# GTR: (24 July 2020) I don't recall what these comments are about. Should take another look at them.\n#\n# GTR: Split out cell that defines airmasses. Just define one at a time. Predefine the experiments and comment out the ones being run each time. Make sure that the output files are unique for each experiment.\n#\n# GTR: Run colors only and colors+normal DCR just once. We don't need to run those again. But those can be the first 2 \"experiments\".\n\n#%%\n\n# GTR: Extract the airmass and filters for each observation\n\n#%%\n\n# Weixiang: modified the item index to match the order of columns in new file\nairmasses = np.array([item[3] for item in obs])\nfilters = np.array([item[5] for item in obs])\n\n#airmasses_long = np.append(airmasses, [1.6, 1.6])\n#filters_long = np.append(filters, ['g', 'g'])\n#airmasses_twilight = np.append(airmasses, [2.0, 2.0])\n#filters_twilight = np.append(filters, ['g', 'g'])\n\n\n# BEE: The next cell is a switch that lets you choose the experiment to run. There are 2 types of experiments: 'substitution' and 'addition'. Change the string in the cell to either 'substitution' or 'addition'. The airmasses should be 1.6, 1.7, 1.8, 1.9, or 2.0. In the case of addition, you can set airmass_to_use to an array of airmasses and it will add all of them. 
NOTE: Make sure, if you're running multiple experiments, to run the cell above for each one so you don't overwrite the wrong airmasses array.\n\n#%%\n\n# GTR: Let's not do that experiment any more and just explore the different opSims.\n# So either take this out or just leave the array blank.\n\n#%%\n\nexperiment_to_run = 'addition'\n#experiment_to_run = 'substitution'\n#experiment_to_run = 'addition'\nairmass_to_use = []\n\n#%%\n\nif experiment_to_run == 'colors':\n save_file_name = 'AstroMetric_Colors_noDCR.npz'\n\n#%%\n\nif experiment_to_run == 'substitution':\n airmass_to_substitute = airmass_to_use[0]\n index_of_lowest = np.argmin(airmasses)\n airmasses[index_of_lowest] = airmass_to_substitute\n save_file_name = 'AstroMetric_SubstitutionDCR_' + \\\n str(int(airmass_to_substitute*10)) + '.npz'\n\n#%%\n\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\n\n#%%\n\n# GTR: Not sure why this is here\n# and not clear that this file name is being used\n# I think that Bee was just trying to compare the results after 20 and 3 observations.\n\n#%%\n\n# airmass removal cell\nprint(len(airmasses))\n# if you don't want to remove any, set number_to_leave to \"all\"\nnumber_to_leave = 20\nnumber_to_leave = \"all\"\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\n\n#%%\n\nprint(len(airmasses))\nprint(airmasses)\nprint(filters)\nprint(save_file_name)\n\n#%%\n\n# GTR: I think that this is just to provide a basis of comparison with just a few (here 3) epochs.\n\n#%%\n\nairmasses_20 = airmasses\nfilters_20 = filters\nif experiment_to_run == 'addition':\n filters_to_add = np.tile('g', int(len(airmass_to_use)))\n airmasses = np.append(airmasses, airmass_to_use)\n filters = np.append(filters, filters_to_add)\n save_file_name = 'AstroMetric_TwilightDCR_' + \\\n str([int(airmass_to_use[i]*10)\n for i in range(len(airmass_to_use))]) + '.npz'\nnumber_to_leave = 3\nif number_to_leave != \"all\":\n save_file_name = save_file_name[:-4] + \"_\" + \\\n str(number_to_leave) + \"obs\" + save_file_name[-4:]\n print(\"file name is \" + save_file_name)\n number_to_remove = len(airmasses) - number_to_leave\nelse:\n number_to_remove = 0\nremoved = 0\nwhile removed < number_to_remove:\n remove_index = random.randint(0, len(airmasses)-1)\n airmasses = np.delete(airmasses, remove_index)\n filters = np.delete(filters, remove_index)\n removed += 1\nairmasses_3 = airmasses\nfilters_3 = filters\n\n\n# ## generate observed slopes from true slopes and observations\n\n#%%\n\n# BEE: lnlike calculates the loglikelihood, lnprior creates a prior on our linear fits, lnprob adds the prior to lnlike\n# BEE: run_fit runs the mcmc walkers over a range of linear fits and selects the median as the best fit and half the\n# difference between 16th and 84th percentiles as the error\n# GTR: run_fit is computing the slope in the offset vs. 
tanZ plane for a single object\n\n#%%\n\ndef lnlike(theta, x, y, yerr):\n m, lnf = theta\n model = m*x\n inv_sigma2 = 1.0/(yerr**2. + model**2.*np.exp(2.*lnf))\n return -0.5*(np.sum(((y-model)**2.*inv_sigma2 - np.log(inv_sigma2))))\n\n\ndef lnprior(theta):\n m, lnf = theta\n if (-1.0 < m < 1.0) and (-100.0 < lnf < 100.0):\n return 0.0\n return -np.inf\n\n\ndef lnprob(theta, x, y, yerr):\n lp = lnprior(theta)\n if not np.isfinite(lp):\n return -np.inf\n return lp + lnlike(theta, x, y, yerr)\n\n\ndef run_fit(tanZList, RList, RerrList):\n nll = lambda *args: -lnprob(*args)\n x = np.copy(tanZList)\n y = np.copy(RList)\n yerr = np.copy(RerrList)\n # first do a simple minimization to get starting values for mcmc\n pm = np.random.choice([-1.0, 1.0], size=len(x), replace=True)\n result = minimize(nll, [-0.001, np.log(0.5)], args=(x, y, yerr))\n m_ml, lnf_ml = result[\"x\"]\n # now run mcmc\n ndim, nwalkers = 2, 100\n pos = [result[\"x\"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)]\n sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))\n sampler.run_mcmc(pos, 500)\n samples = sampler.chain[:, 50:, :].reshape((-1, ndim))\n ms = samples[np.random.randint(len(samples), size=100)][:, 0]\n # return the median walker as the best slope and the half the 16-84th percentiles as the error\n m_mcmc, lnf_mcmc = map(lambda v: (v[1]), zip(\n *np.percentile(samples, [16, 50, 84], axis=0)))\n merr_mcmc, lnf_mcmc = map(lambda v: (\n 0.5*(v[2]-v[0])), zip(*np.percentile(samples, [16, 50, 84], axis=0)))\n return m_mcmc, merr_mcmc\n\n\n# GTR: Split out cells that define functions from cells that make calls to those functions.\n\n#%%\n\n# GTR: dcrSlopeCalc is computing the slope in the offset vs. tanZ plane for all the objects, calling run_fit for each\n\n#%%\n\ndef dcrSlopeCalc(airmasses, filters, test_quasars, makePlot=True):\n astrometric_error = [0.035, 0.025]\n obs_slopes_u = np.zeros((len(test_quasars)))\n obs_slopes_uerr = np.zeros((len(test_quasars)))\n obs_slopes_g = np.zeros((len(test_quasars)))\n obs_slopes_gerr = np.zeros((len(test_quasars)))\n imgNumString = 0\n xAxis = np.linspace(0, 2.0, 100)\n for i in range(len(test_quasars)):\n true_slope_u = test_quasars['u-slope'][i]\n true_slope_g = test_quasars['g-slope'][i]\n\n tanZList_u = np.array([])\n RerrList_u = np.array([])\n RList_u = np.array([])\n tanZList_g = np.array([])\n RerrList_g = np.array([])\n RList_g = np.array([])\n\n for j, airmass in enumerate(airmasses):\n # tangent of zenith angle of this observation\n tanZ_obs = np.tan(np.arccos(1.0/airmass))\n if filters[j] == 'u':\n # calculate the observed offset\n # random scatter around the true offset using a normal distribution with the astrometric error as the standard deviation\n R_obs = normal(true_slope_u*tanZ_obs, astrometric_error[0])\n # list of x axis values\n tanZList_u = np.append(tanZList_u, tanZ_obs)\n # list of y axis error values\n RerrList_u = np.append(RerrList_u, astrometric_error[0])\n RList_u = np.append(RList_u, R_obs) # list of y axis values\n if filters[j] == 'g':\n R_obs = normal(true_slope_g*tanZ_obs, astrometric_error[1])\n tanZList_g = np.append(tanZList_g, tanZ_obs)\n RerrList_g = np.append(RerrList_g, astrometric_error[1])\n RList_g = np.append(RList_g, R_obs)\n\n # fit a stright line through the x and y values, using the y-err values\n m_mcmc_u, merr_mcmc_u = run_fit(tanZList_u, RList_u, RerrList_u)\n m_mcmc_g, merr_mcmc_g = run_fit(tanZList_g, RList_g, RerrList_g)\n if makePlot == True:\n bestFitLine_u = m_mcmc_u*xAxis + 0.0\n 
bestFitLine_g = m_mcmc_g*xAxis + 0.0\n trueFitLine_u = true_slope_u*xAxis + 0.0\n trueFitLine_g = true_slope_g*xAxis + 0.0\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.title('u-band observations + fit')\n plt.scatter(tanZList_u, RList_u, label='Observations')\n plt.plot(xAxis, bestFitLine_u, label='Fit Line')\n plt.plot(xAxis, trueFitLine_u, label='True Line')\n plt.legend()\n plt.xlabel('Tan(Z)')\n plt.ylabel('delta R')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_u, y=RList_u)\n plt.subplot(122)\n plt.title('g-band observations + fit')\n plt.scatter(tanZList_g, RList_g, label='Observations')\n plt.plot(xAxis, bestFitLine_g, label='Fit Line')\n plt.plot(xAxis, trueFitLine_g, label='True Line')\n plt.xlabel('Tan(Z)')\n plt.xlim(0.0, 2.0)\n plt.scatter(x=tanZList_g, y=RList_g)\n filename = \"TanZimgFiles/airmassOffsetFit\" + \\\n str(len(airmasses))+\"_\"+\"{:0>5d}\".format(imgNumString)\n plt.savefig(filename)\n plt.clf()\n plt.close()\n imgNumString += 1\n obs_slopes_u[i] = m_mcmc_u\n obs_slopes_uerr[i] = merr_mcmc_u\n obs_slopes_g[i] = m_mcmc_g\n obs_slopes_gerr[i] = merr_mcmc_g\n if makePlot == True:\n deltaSlope_u = []\n deltaSlope_g = []\n for i in range(len(obs_slopes_u)):\n deltaSlope_u = np.append(\n deltaSlope_u, test_quasars['u-slope'][i] - obs_slopes_u[i])\n for i in range(len(obs_slopes_g)):\n deltaSlope_g = np.append(\n deltaSlope_g, test_quasars['g-slope'][i] - obs_slopes_g[i])\n plt.figure(figsize=(12, 12))\n plt.subplot(121)\n plt.hist(deltaSlope_u, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope u-band '+str(len(airmasses)))\n plt.subplot(122)\n plt.hist(deltaSlope_g, bins=50, range=(-0.3, 0.3))\n plt.title('Delta Slope g-band '+str(len(airmasses)))\n filename = \"DeltaSlopeimgFiles/deltaSlopeHist\" + str(len(airmasses))\n plt.savefig(filename)\n return obs_slopes_u, obs_slopes_uerr, obs_slopes_g, obs_slopes_gerr\n\n#%%\n\n# GTR: This cell actually calls the code that computes the slopes\n# This is taking every object in the test set and treating them as if they were observed at the same position\n# on the sky from the simulation. That's why the number of airmasses is the same. Some of them are u and\n# some are g.\n\n#%%\n\nobs_slopes_u_20, obs_slopes_uerr, obs_slopes_g_20, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_20, filters_20, test_quasars)\nobs_slopes_u_3, obs_slopes_uerr, obs_slopes_g_3, obs_slopes_gerr = dcrSlopeCalc(\n airmasses_3, filters_3, test_quasars)\n\n#%%\n\nsort_indices = np.argsort(test_quasars['zspec'])\nplt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. 
Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_3[sort_indices],\n color='magenta', alpha=0.5, label='Observed u slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('u-band DCR slope')\nplt.subplot(212)\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['g-slope']\n [sort_indices], color='blue', label='True g slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_20[sort_indices],\n color='black', label='Observed g slope@20 obs', alpha=0.7)\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_g_3[sort_indices],\n color='cyan', alpha=0.5, label='Observed g slope@3 obs')\nplt.legend(loc='upper right')\nplt.ylabel('g-band DCR slope')\nplt.xlabel('Redshift')\n\n#%%\n\n# GTR: I have ignored everything past here.\n# I was more concerned about making sure that we could reproduce the above plot.\n\n\n# ## calculate redshift PDFs for observed quasars\n\n#%%\n\ndef calculate_PDFs(parameters, zshifts, feature_zshift_fit, feature_covariance):\n\n num_features = int((np.shape(parameters)[0]-1)/2)\n num_of_quasars = np.shape(parameters)[1]", "original_comment": " # empty arrays to be filled\n", "target_code": " feature_distance = np.zeros((num_of_quasars, num_features, len(zshifts)))\n", "project_metadata": {"full_name": "RichardsGroup/LSSTprep", "description": "Repository for Richards group LSST prep work, specifically related to the AGN SC", "topics": [], "git_url": "git://github.com/RichardsGroup/LSSTprep.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2018-06-20T20:43:08Z", "size": 30265, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 8424521, "Python": 6419}, "last_updated": "2020-09-28T18:32:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "plt.figure(figsize=(12, 12))\nplt.subplot(211)\nplt.title('Observed DCR Slopes vs. Redshift')\nplt.scatter(test_quasars['zspec'][sort_indices], test_quasars['u-slope']\n [sort_indices], color='red', label='True u slope')\nplt.plot(test_quasars['zspec'][sort_indices], obs_slopes_u_20[sort_indices],\n color='black', label='Observed u slope@20 obs', alpha=0.7)\nplt.\n", "model": "no-comments", "intent": " # empty arrays to be filled"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. 
Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n#%%\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------", "original_comment": "# Create figure and dimensions\n", "target_code": "plt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\n", "project_metadata": {"full_name": "isaacarroyov/ss_plots", "description": "Repositorio de gr\u00e1ficas realizadas en Python para mis boletines de servicio social (Ecuaciones Diferenciales y An\u00e1lisis Vectorial) || Repository of the plots made in Python for my social service bulletins (Differential Equations and Vector Calculus)", "topics": ["differential-equations", "math", "vector-analysis", "university", "python3", "python", "ecuaciones-diferenciales"], "git_url": "git://github.com/isaacarroyov/ss_plots.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-27T19:15:30Z", "size": 21849, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 29758848}, "last_updated": "2020-11-24T18:53:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "fig = plt.figure()\nax = fig.add_subplot(111)\nax.set_facecolor(c_background) # change the color of the background\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\nplt\n", "model": "no-comments", "intent": "# Create figure and dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import 
SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n#%%\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n#%%\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n#%%\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n#%%\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n#%%\n\n# The average value of daily offence count\navg_count = 
crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n#%%", "original_comment": "# The standard deviation of daily offence count\n", "target_code": "std_count = crime_by_date.groupBy() .agg(stddev(\"Count\")\n ) .withColumnRenamed(\"stddev_samp(Count)\", \"Standard Deviation\")\n", "project_metadata": {"full_name": "WaicongTam/Assignment-Portfolio", "description": "This repository is showcase of the codes of my assignments. All the assignments I consider worth sharing will be updated here right after the late penalty has reached 50%.", "topics": [], "git_url": "git://github.com/WaicongTam/Assignment-Portfolio.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2019-06-01T03:27:31Z", "size": 10261, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1789685, "Java": 101530}, "last_updated": "2020-10-15T15:22:21Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "crime_by_date_std = crimes_df.groupBy() .std(\n \"Count\") .withColumnRenamed(\"std(Count)\", \"STD\")\ncrime_by_date_std.show()\n", "model": "docstring", "intent": "# The standard deviation of daily offence count"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = 
prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. 
(fips, regionidcity, regionidzip, regionidcounty)\n\n\ndf['age'] = 2017 - df.yearbuilt\n\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. I'll test it here, but will need to implement above before we do all the prepping.\n\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = 
pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n\ntrain.cluster_loc.value_counts()\n\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n\ntest.cluster_home.value_counts()\n\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\nless_significant_clusters\n\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if 
greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n\ntrain.head()\n\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n\ntrain_no_clusters.head()\n\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 models, 1 for each county\n# Try with clusters and then try with original 
features\n\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = 
lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n#%%\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n#%%\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n#%%\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n#%%\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a 
single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n#%%\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n#%%\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. (fips, regionidcity, regionidzip, regionidcounty)\n\n#%%\n\ndf['age'] = 2017 - df.yearbuilt\n\n#%%\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n#%%\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. 
I'll test it here, but will need to implement above before we do all the prepping.\n\n#%%\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n#%%\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n#%%\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n#%%\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n#%%\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n#%%\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n#%%\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n#%%\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n#%%\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = 
test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n#%%\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n#%%\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n#%%\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n#%%\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n#%%\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n#%%\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n#%%\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n#%%\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n#%%\n\ntrain.cluster_loc.value_counts()\n\n#%%\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n#%%\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n#%%\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n#%%\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n#%%\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n#%%\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n#%%\n\ntest.cluster_home.value_counts()\n\n#%%\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n#%%\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\nless_significant_clusters\n\n#%%\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n#%%\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n#%%\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n#%%\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n#%%\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data 
Validation**\n\n#%%\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n#%%\n\ntrain.head()\n\n#%%\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n#%%\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n#%%\n\ntrain_no_clusters.head()\n\n#%%\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n#%%\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n#%%\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 
models, 1 for each county\n# Try with clusters and then try with original features\n\n#%%\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n#%%\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n#%%\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n#%%\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n#%%\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n#%%\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n#%%\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = 
LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#", "original_comment": "# ##### Linear Support Vector Regressor from sklearn.svm\n", "target_code": "regr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\n", "project_metadata": {"full_name": "CodeupClassroom/bayes-methodologies-exercises", "description": "Bayes exercises on methodologies", "topics": [], "git_url": "git://github.com/CodeupClassroom/bayes-methodologies-exercises.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2019-10-09T14:04:48Z", "size": 13779, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 17490873, "Python": 71621}, "last_updated": "2020-01-06T20:54:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = 
lasso.predict(X\n", "model": "no-comments", "intent": "# Linear Support Vector Regressor from sklearn.svm"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
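One quick, optional check before any plotting (an editor-added sketch, not part of the original lab; the helper name iris_df is hypothetical) is to compare the per-class feature means:\n\n\n# editor-added illustrative cell: per-class feature means of the Iris dataset\niris_df = pd.DataFrame(iris.data, columns=iris.feature_names)\niris_df['class'] = iris.target\niris_df.groupby('class').mean()\n\n\n# 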
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
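To get a concrete feel for these distances, here is a small numeric illustration (an editor-added sketch using arbitrary toy values, not part of the original tutorial):\n\n\n# editor-added illustrative cell: distances of two toy points to a toy hyperplane w*x + b = 0\nw, b = np.array([3.0, 4.0]), -1.0\nx_pos, x_neg = np.array([2.0, 1.0]), np.array([-1.0, 0.0])\nd_plus = abs(np.dot(w, x_pos) + b) / np.linalg.norm(w)\nd_minus = abs(np.dot(w, x_neg) + b) / np.linalg.norm(w)\n# if x_pos and x_neg were the closest examples on each side, d_plus + d_minus would be the margin\nprint(d_plus, d_minus, d_plus + d_minus)\n\n\n# 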
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. 
Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. 
\"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. 
Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. 
the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "from sklearn.svm import SVC\n\nsvm = SVC(kernel='linear', random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "svm = SVC(kernel='linear')\n", "model": "natural", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n'''\nLSTM playground\n'''\n\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, 
iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n\nbatchedInputMatrix.shape\n\n\nbatchedOutputMatrix.shape\n\n\nbatchedInputMatrix[0, 0]\n\n\nbatchedOutputMatrix[0, 0]\n\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n#%%\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n#%%\n\n'''\nLSTM playground\n'''\n\n#%%\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n#%%\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n#%%\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n#%%\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n 
int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n#%%\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n#%%\n\nbatchedInputMatrix.shape\n\n#%%\n\nbatchedOutputMatrix.shape\n\n#%%\n\nbatchedInputMatrix[0, 0]\n\n#%%\n\nbatchedOutputMatrix[0, 0]\n\n#%%\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n#%%\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n#%%\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n#%%\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))", "original_comment": "# output layer\n", "target_code": "simpleStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n", "project_metadata": {"full_name": "miroenev/teach_DL", "description": null, "topics": [], "git_url": "git://github.com/miroenev/teach_DL.git", "stars": 36, "watchers": 36, "forks": 15, "created": "2017-07-19T18:01:29Z", "size": 98182, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12259106, "Python": 43930, "Dockerfile": 2478, "Shell": 1713}, "last_updated": "2020-09-04T16:13:54Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "simpleDenseModel.summary()\nsimpleNonStatefulModel.summary()\nsimpleStatefulModel.summary()\n", "model": "no-comments", "intent": "# add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this 
notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score 
Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed Datasets \n\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled 
X-Datasets_\n\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. \"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. 
\"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n#%%\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n#%%\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n#%%\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n#%%\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n#%%\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next 
step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n#%%\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n#%%\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n#%%\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n#%%\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n#%%\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n#%%\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed 
Datasets \n\n#%%\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n#%%\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n#%%\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n#%%\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n#%%\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. 
\"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n#%%\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. \"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#", "original_comment": "# Function to scale input data (`X`) for predictive purposes.\n", "target_code": "from sklearn.preprocessing import StandardScaler\n\ndef dataset_scaler(training_data, testing_data, obj=StandardScaler):\n \"\"\" Function to scale X-data using custom input algorithm. \"\"\"\n SCALED_FEATURES = [feature + \"_sca\" for feature in training_data]\n scaler = obj()\n scaler.fit(training_data)\n X_train_sca = pd.DataFrame(scaler.transform(\n training_data).T, SCALED_FEATURES).T\n X_test_sca = pd.DataFrame(scaler.transform(\n testing_data).T, SCALED_FEATURES).T\n return X_train_sca, X_test_sca\n", "project_metadata": {"full_name": "AakashSudhakar/6nomads-interview-project", "description": "Interview project repository for data analysis and prediction for 6Nomads data. ", "topics": ["data-analysis", "data-processing", "data-science", "machine-learning", "data-structures"], "git_url": "git://github.com/AakashSudhakar/6nomads-interview-project.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2019-09-06T05:04:40Z", "size": 385, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 545554, "Python": 21164}, "last_updated": "2020-05-13T23:33:12Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "def scale_data(X, target):\n \"\"\" Function to scale input data (`X`) for predictive purposes. 
\"\"\"\n scaler = StandardScaler()\n scaler.fit(X)\n X = scaler.transform(X)\n return X, scaler\n", "model": "natural", "intent": "# Function to scale input data (`X`) for predictive purposes."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n\ndata.head()\n# look at the dataset\n\n\ndata.info()\n# basic info of dataset\n\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n\ndata.describe()\n\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n\ndata.columns\n\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n\n# Data Visulaization\n\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n\nsns.countplot(data['room_type'])\n\n\n# We can observe reduced preference in shared rooms\n\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n\n# heavily skewed\n\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n\ndata.corr()\n\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews 
per month\n\n\n# to plot locaation and price on NY city map\n\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)\nplt.figure(figsize=(12, 12), facecolor='black')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n#%%\n\ndata.head()\n# look at the dataset\n\n#%%\n\ndata.info()\n# basic info of dataset\n\n#%%\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n#%%\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n#%%\n\ndata.describe()\n\n#%%\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n#%%\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n#%%\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n#%%\n\ndata.columns\n\n#%%\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n#%%\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n#%%\n\n# Data Visulaization\n\n#%%\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n#%%\n\nsns.countplot(data['room_type'])\n\n#%%\n\n# We can observe reduced preference in shared rooms\n\n#%%\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n#%%\n\n# heavily skewed\n\n#%%\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = 
count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n#%%\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n#%%\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n#%%\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n#%%\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n#%%\n\ndata.corr()\n\n#%%\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n#%%\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n#%%\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n#%%\n\n# to plot locaation and price on NY city map\n\n#%%\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n#%%\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n#%%\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n#%%\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n#%%\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)", "original_comment": "# Plot the counts\n", "target_code": "ax = plt.subplot(1, 1, 1)\nplt.imshow(img, 'hot')\nplt.axis('off')\nplt.tight_layout()\n", "project_metadata": {"full_name": "maheshd20/Da_project_sem5", "description": null, "topics": [], "git_url": "git://github.com/maheshd20/Da_project_sem5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-30T13:22:44Z", "size": 5278, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1470956}, "last_updated": "2020-11-30T15:37:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, 
"usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.countplot(x='bedrooms', data=data)\n", "model": "docstring", "intent": "# Plot the counts"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\nfrom extract_feature import FeatureExtractor\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n#%%\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()", "original_comment": "# ## Extract Feature\n", "target_code": "from extract_feature import FeatureExtractor\n\nFE = FeatureExtractor(model_name='xlm-r')\n", "project_metadata": {"full_name": "ilhamfp/indonesian-text-classification-multilingual", "description": "Improving Indonesian text classification using multilingual language model", "topics": ["multilingual-language-model", "text-classification", "indonesian-language", "indonesian-text-classification", "sentiment-analysis", "hate-speech-detection", "language-model", "multilingual", "zero-shot", "monolingual", "cross-lingual-transfer", "multilingual-language-models", "indonesian-data", "english-language"], "git_url": "git://github.com/ilhamfp/indonesian-text-classification-multilingual.git", "stars": 7, "watchers": 7, "forks": 0, "created": "2020-04-26T07:27:39Z", "size": 15604, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3476215, "Python": 28982}, "last_updated": "2020-12-20T17:12:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": 
"Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "X_train = feature_extraction.text.TfidfVectorizer(\n ngram_range=(1, 2), analyzer='char_wb').fit_transform(train['comment'])\ny_train = train['label'].values\n", "model": "docstring", "intent": "# Extract Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. 
These types of networks are called Mixture Density Networks.\n\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n\nX_train.shape, Y_train.shape\n\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", fontsize=16)\n plt.ylabel(\"$x(t)$\", fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. 
But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. 
This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. 
In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. 
The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. 
This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n#%%\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n#%%\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n#%%\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n#%%\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n#%%\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n#%%\n\nX_train.shape, Y_train.shape\n\n#%%\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", 
fontsize=16)\n plt.ylabel(\"$x(t)$\", fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n#%%\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
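\n#\n# As a quick aside, here is a minimal numeric sketch (toy data, independent of the series above) of why an MSE-trained\n
# model is pulled towards the mean: for a target that is +1 with probability 0.65 and -1 with probability 0.35, the constant\n
# prediction that minimizes the MSE is the mean, 0.3, a value the target itself practically never takes.\n\n#%%\n\n
# toy bimodal target with the same 65/35 split as the series above\nimport numpy as np\n\n
rng = np.random.default_rng(0)\n
y_toy = rng.choice([-1.0, 1.0], size=100000, p=[0.35, 0.65])\n
candidates = np.linspace(-1.0, 1.0, 201)\n
# mean squared error of every constant prediction against the toy target\n
mse = np.array([np.mean((y_toy - c) ** 2) for c in candidates])\n
print('MSE-optimal constant prediction:', candidates[mse.argmin()])  # close to 0.3, far from both modes\n\n\n# 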
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n#%%\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n#%%\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n#%%\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n#%%\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n#%%\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n#%%\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n#%%\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n#%%\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
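\n#\n# As a quick cross-check (an alternative to the sign-based split used below, and assuming Y_test from the cells above is\n
# in scope), a two-component GaussianMixture from scikit-learn can recover the two modes without any supervision:\n\n#%%\n\n
# unsupervised estimate of the two components of the 6th forecast step\nimport numpy as np\nfrom sklearn.mixture import GaussianMixture\n\n
gmm = GaussianMixture(n_components=2, random_state=0)\n
gmm.fit(Y_test[:, 5].reshape(-1, 1))      # sklearn expects a 2-D array\n
print(gmm.weights_)                       # roughly 0.35 / 0.65\n
print(gmm.means_.ravel())                 # roughly the two mode centers\n
print(np.sqrt(gmm.covariances_).ravel())  # roughly the two standard deviations\n\n\n# 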
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n#%%\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n#%%\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n#%%\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
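\n#\n# Before sizing it, here is a tiny stand-alone sketch of the kind of object such a layer outputs: essentially a Categorical\n
# over components combined with Normal components through MixtureSameFamily, only with event_shape [10] instead of a scalar.\n
# The weights, means and scales below are illustrative guesses in the rough range of step 5, not values computed from the data.\n\n#%%\n\n
# scalar toy version of the mixture distribution a MixtureNormal head parameterizes\nimport tensorflow_probability as tfp\n\ntfd = tfp.distributions\n\n
toy_mix = tfd.MixtureSameFamily(\n    mixture_distribution=tfd.Categorical(probs=[0.35, 0.65]),\n    components_distribution=tfd.Normal(loc=[-0.25, 0.25], scale=[0.05, 0.05]))\n\n
print(toy_mix.mean().numpy())     # 0.35*(-0.25) + 0.65*0.25 = 0.075\n
print(toy_mix.sample(5).numpy())  # draws cluster around one of the two modes\n\n\n# 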
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n#%%\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n#%%\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n#%%\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n#%%\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n#%%\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n#%%\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n#%%\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n#%%\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n#%%\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n#%%\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n#%%\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n#%%\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n#%%", "original_comment": "# sample from the forecasted distribution\n", "target_code": "smpl = yhat.sample(100).numpy()\n", "project_metadata": {"full_name": "sinusgamma/multimodal_network", "description": "Mixture Density Network with Tensorflow Probability. 
Demonstrate the usefulness of multi-modal distribution outputs for neural networks.", "topics": [], "git_url": "git://github.com/sinusgamma/multimodal_network.git", "stars": 11, "watchers": 11, "forks": 0, "created": "2020-03-08T10:08:43Z", "size": 3194, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1217660}, "last_updated": "2021-01-04T15:29:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "f = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n", "model": "natural", "intent": "# sample from the forecasted distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', 
expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = 
df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: 
x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = 
df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 
14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 
'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population 
numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 
'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))", "original_comment": "# Make a copy in the specific subfolder\n", "target_code": "df_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "df_log_new = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min]\ndf_log_new.tail()\n", "model": "no-comments", "intent": "# Make a copy in the specific subfolder"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport 
math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n#%%\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker.estimator import Estimator\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n#%%\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)", "original_comment": "# #### Submit the job\n", "target_code": "from sagemaker.estimator import Estimator\n\nestimator = Estimator(role=role,\n train_instance_count=n_instances,\n train_instance_type=instance_type,\n image_name=image_name,\n hyperparameters=hyperparameters)\nestimator.fit()\n", "project_metadata": {"full_name": "daniel-fudge/DRL-Portfolio-Optimization-Custom", "description": "A portfolio optimization framework leveraging Deep Reinforcement Learning (DRL) and a custom trading environment", "topics": [], "git_url": "git://github.com/daniel-fudge/DRL-Portfolio-Optimization-Custom.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-06-12T22:27:29Z", "size": 35064, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1170339, "Python": 39958, "Shell": 4637}, "last_updated": "2020-11-01T22:06:49Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": 
"Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "get_ipython().run_cell_magic('time', '',\n 'import boto3\\nfrom time import gmtime, strftime\\n\\ns3 = boto3.client(\\'s3\\')\\n# create unique job name \\njob_name_prefix = \\'portfolio-optimization\\'\\ntimestamp = time.strftime(\\'-%Y-%m-%d-%H-%M-%S\\', time.gmtime())\\njob_name = job_name_prefix + timestamp\\ntraining_params = \\\\\\n{\\n # specify the training docker image\\n \"AlgorithmSpec\n", "model": "no-comments", "intent": "# Submit the job"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! Finance.\n\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. 
Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
  • header: include the first row as the header for the DataFrame object
  • \n#
  • index_col: set the index column of the DataFrame to the first column of the data set ('month')
  • \n#
  • parse_dates: automatically parse dates which will index the DataFrame
  • \n#
  • sep: specify the symbol which separates the values and strings in the data set - in this case it is a semicolon
  • \n#
\n\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n\n# print the type of an object\ntype(df)\n\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n\n# list the index of the DataFrame\ndf.index\n\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n\n# print first 5 rows\ny.head()\n\n\n# print type of the ts object\ntype(y)\n\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n#%%\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
  • header: include the first row as the header for the DataFrame object
  • \n#
  • index_col: set the index column of the DataFrame to the first column of the data set ('month')
  • \n#
  • parse_dates: automatically parse dates which will index the DataFrame
  • \n#
  • sep: specify the symbol which separates the values and strings in the data set - in this case it is a semicolon
  • \n#
\n\n#%%\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n#%%\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n#%%\n\n# print the type of an object\ntype(df)\n\n#%%\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n#%%\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n#%%\n\n# list the index of the DataFrame\ndf.index\n\n#%%\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n#%%\n\n# print first 5 rows\ny.head()\n\n#%%\n\n# print type of the ts object\ntype(y)\n\n#%%\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n#%%\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n#%%\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n#%%\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n#%%", "original_comment": "# ask Yahoo! Finance for data\n", "target_code": "corporate_actions = web.DataReader(tickers, 'yahoo-actions', start, end)\n", "project_metadata": {"full_name": "dacatay/time-series-analysis", "description": "Presentation for time series analysis", "topics": [], "git_url": "git://github.com/dacatay/time-series-analysis.git", "stars": 41, "watchers": 41, "forks": 53, "created": "2017-09-08T13:45:56Z", "size": 43990, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12370243, "R": 4829}, "last_updated": "2020-11-05T10:34:15Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "panel = web.DataReader(tickers, provider, start, end)\nprint(panel['Adj Close'])\n", "model": "docstring", "intent": "# ask Yahoo! 
Finance for data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n#%%\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n#%%\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n#%%\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n#%%\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n#%%\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n#%%\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", 
data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n#%%\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n#%%\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n#%%\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n#%%\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n#%%\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()", "original_comment": "# ### Use a different marker for the hue levels:\n", "target_code": "palette = dict(Lunch=\"blue\", Dinner=\"red\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\", palette=palette,\n hue_order=[\"Dinner\", \"Lunch\"],\n hue_kws=dict(marker=[\"^\", \"v\"]))\n", "project_metadata": {"full_name": "scidatmath2020/Ciencia-de-datos-con-Python", "description": null, "topics": [], "git_url": "git://github.com/scidatmath2020/Ciencia-de-datos-con-Python.git", "stars": 20, "watchers": 20, "forks": 27, "created": "2020-09-07T20:49:59Z", "size": 20544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5705341, "Python": 12821}, "last_updated": "2020-11-19T22:06:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "sns.hls_palette(10)\n", "model": "docstring", "intent": "# Use a different marker for the hue levels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. 
I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n\ntype(my_decimal)\n\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? 
Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n#%%\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. 
*Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n#%%\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n#%%\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n#%%\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n#%%\n\ntype(my_decimal)\n\n#%%\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n#%%\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n#%%\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? 
Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n#%%\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#", "original_comment": "# Lists can also be added to:\n", "target_code": "my_number_list.append(6)\n", "project_metadata": {"full_name": "wrightaprilm/CompBio2018", "description": null, "topics": [], "git_url": "git://github.com/wrightaprilm/CompBio2018.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2018-08-14T16:08:48Z", "size": 8976, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4697133, "TeX": 4884, "Python": 4399}, "last_updated": "2019-06-27T20:53:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "my_number_list.append(6)\nmy_number_list\n", "model": "natural", "intent": "# add element to list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n\ntype(raw_text)\n\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n\nraw_text[:100]\n\n\n# ## Tokenization\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n#%%\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom nltk import word_tokenize\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n#%%\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n#%%\n\ntype(raw_text)\n\n#%%\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n#%%\n\nraw_text[:100]\n\n\n# ## Tokenization", "original_comment": "# Turning the text into words using the nltk word_tokenizer\n", "target_code": "from nltk import word_tokenize\n\nwords_text = word_tokenize(raw_text)\n", "project_metadata": {"full_name": "derekjjackson/DH_PythonLibraries_JupyterNotebooks", "description": "FIles and resources for using Data Science, Python, and Jupyter Notebooks in the practice of Digital Humanities", "topics": [], "git_url": "git://github.com/derekjjackson/DH_PythonLibraries_JupyterNotebooks.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2018-10-20T15:06:33Z", "size": 29200, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10076299}, "last_updated": "2020-12-25T21:05:12Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "tokens = nltk.word_tokenize(raw_text)\ntokens\n", "model": "natural", "intent": "# Turning the text into words using the nltk word_tokenizer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. 
The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Discretisation with k-means clustering\n#\n# This discretisation method consists in applying k-means clustering to the continuous variable.\n#\n# Briefly, the algorithm works as follows:\n#\n# - 1) Initialization: random creation of K centers\n# - 2) Each data point is associated with the closest center\n# - 3) Each center position is re-computed as the center of its associated points\n#\n# Steps 2 and 3 are repeated until convergence is reached. The algorithm minimises the pairwise squared deviations of points within the same cluster.\n#\n# More details about k-means [here](https://en.wikipedia.org/wiki/K-means_clustering)\n#\n# Nice blog with graphical explanation of k-means [here](https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0)\n#\n# Note that the user, needs to define the number of clusters, as with equal width and equal frequency discretisation.\n#\n# ## Opinion of the instructor\n#\n# I personally don't see how this technique is different from equal width discretisation, when the variables are continuous throughout the value range. 
Potentially it would make a different if the values were arranged in real clusters.\n#\n# So my recommendation is, unless you have reasons to believe that the values of the variable are organised in clusters, then use equal width discretisation as an alternative to this method.\n#\n#\n# ## In this demo\n#\n# We will learn how to perform k-means discretisation using the Titanic dataset and Scikit-learn\n\n# ## Titanic dataset\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\n\nfrom sklearn.preprocessing import KBinsDiscretizer\n\n#%%\n\n# load the numerical variables of the Titanic Dataset\n\ndata = pd.read_csv('../titanic.csv',\n usecols=['age', 'fare', 'survived'])\n\ndata.head()\n\n#%%", "original_comment": "# Let's separate into train and test set\n", "target_code": "from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n data[['age', 'fare']],\n data['survived'],\n test_size=0.3,\n random_state=0)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "X_train, X_test, y_train, y_test = train_test_split(\n data[['age', 'fare','survived']], data.survived, test_size=0.3, random_state=0)\nX_train.shape, X_test.shape\n", "model": "natural", "intent": "# Let's separate into train and test set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n\nfeatures = iris_df.values\n\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n\n\n\n\n# ### Split the data into training and testing 
data, with random seeding\n\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n#%%\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n#%%\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n#%%\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n#%%\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n#%%\n\nfeatures = iris_df.values\n\n#%%\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n#%%\n\n\n\n#%%\n\n# ### Split the data into training and testing data, with random seeding\n\n#%%\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)", "original_comment": "# ### Creating an instance of LogisticRegrssion class\n", "target_code": "from sklearn.linear_model import LogisticRegression\n\nlogReg = LogisticRegression()\n", "project_metadata": {"full_name": "naveen21553/ml-workshop", "description": "Machine Learning Workshop Resources", "topics": [], "git_url": "git://github.com/naveen21553/ml-workshop.git", "stars": 12, "watchers": 12, "forks": 14, "created": "2018-09-28T15:03:08Z", "size": 5274, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 685393, "Python": 11705}, "last_updated": "2020-10-11T10:46:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "clf = tree.DecisionTreeClassifier()\nclf = clf.fit(x_train, y_train)\n", "model": "no-comments", "intent": "# Creating an instance of LogisticRegrssion class"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport 
time\n\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n#%%\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n#%%\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = 
pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n#%%\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n#%%\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n#%%\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)", "original_comment": "# Reshape validation data so that model can be run\n", "target_code": "X_valid = X_valid.values.reshape(-1, 6)\n", "project_metadata": {"full_name": "abhishek3aj/ML1819--task-101--team-06", "description": "ML framework comparison", "topics": [], "git_url": "git://github.com/abhishek3aj/ML1819--task-101--team-06.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-10-09T09:48:20Z", "size": 21107, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4638466, "Python": 84406}, "last_updated": "2018-12-17T19:27:23Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "X_valid = X_valid.reshape(X_valid.shape[0], 1, X_valid.shape[1])\nX_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])\nlrm.score(X_valid, y_valid)\n", "model": "docstring", "intent": "# Reshape validation data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# 
We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n#%%\n\nimport seaborn as sns", "original_comment": "# This makes a white background with grid lines\n", "target_code": "import seaborn as sns\n\nsns.set_style(\"whitegrid\")\n", "project_metadata": {"full_name": "emonson/pandas-datamatters", "description": "Python for Tabular Data and Visualization \u2013 Data Matters 2020", "topics": [], "git_url": "git://github.com/emonson/pandas-datamatters.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-12-02T18:35:22Z", "size": 5862, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1794056}, "last_updated": "2021-01-05T16:21:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "background = matplotlib.colors.white_bg\n", "model": "docstring", "intent": "# Make a white background with grid lines"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire 
Up\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n\n# Add month-year\ntrain['timestamp'] = 
pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas 
as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n#%%\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n#%%\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n#%%\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add 
week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n#%%\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n#%%\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n#%%\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n#%%\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n#%%\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)", "original_comment": "# ## Prepare a train/test set for Validating Stacking\n", "target_code": "from sklearn.model_selection import train_test_split\n\ntrain_ = train.fillna(-999)\ntest_ = test.fillna(-999)\ntraining, testing = train_test_split(train_, test_size=0.2, random_state=42)\n", "project_metadata": {"full_name": "liujiashen9307/KaggleCompetition", "description": "Code hub for the kaggle competitions I have participated in.", "topics": [], "git_url": "git://github.com/liujiashen9307/KaggleCompetition.git", "stars": 6, "watchers": 6, "forks": 10, "created": "2016-10-12T21:10:54Z", "size": 15258, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16811198, "HTML": 14162298, "Python": 1658600, "R": 8306}, "last_updated": "2020-02-01T03:33:11Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly 
agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "compatibility": "Agree", "compatibility-score": 2, "precision": "Agree", "precision-score": 2}], "predicted_code": "X_train, X_test, y_train, y_test = train_test_split(\n train[ReducedVar], train['interest_level'], test_size=0.2, random_state=42)\n", "model": "docstring", "intent": "# Prepare a train/test set for Validating Stacking"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nplt.imshow(img1)\n\n\nplt.imshow(img2)\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\n# blending images of same size\n\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nlarge_img = img1\nsmall_img = img2\n\n\nx_offset = 0\ny_offset = 0\n\n\n# in numpy x axis is vertical and y axis is horizontal\n\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n\n# Blend images of different sizes\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nimg1.shape\n\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n\nimg2.shape\n\n\nrows, cols, channels = img2.shape\n\n\n# region of interest\nroi = img1[y_offset:1401, 
x_offset:934]\nplt.imshow(roi)\n\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n\nmask_inv.shape\n\n\n# you can see the image is 2D now\n\n\n\n\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n\nplt.imshow(white_bgd)\n\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n\nlarge_img = img1\nsmall_img = final_roi\n\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n\n\n\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n\nret\n\n\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\nshow_img(img)\n\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n\n\n\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\ni = load_img()\nshow_img(i)\n\n\ngamma = 1/4\n\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, 
text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n\nimg = load_img()\nshow_img(img)\n\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n\nimg = load_img()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n#%%\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n#%%\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n#%%\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n#%%\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nplt.imshow(img1)\n\n#%%\n\nplt.imshow(img2)\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\n# blending images of same size\n\n#%%\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', 
img2.shape)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n#%%\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nlarge_img = img1\nsmall_img = img2\n\n#%%\n\nx_offset = 0\ny_offset = 0\n\n#%%\n\n# in numpy x axis is vertical and y axis is horizontal\n\n#%%\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n#%%\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n#%%\n\n# Blend images of different sizes\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nimg1.shape\n\n#%%\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n#%%\n\nimg2.shape\n\n#%%\n\nrows, cols, channels = img2.shape\n\n#%%\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n#%%\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n#%%\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n#%%\n\nmask_inv.shape\n\n#%%\n\n# you can see the image is 2D now\n\n#%%\n\n\n\n#%%\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n#%%\n\nplt.imshow(white_bgd)\n\n#%%\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n#%%\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n#%%\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n#%%\n\nlarge_img = img1\nsmall_img = final_roi\n\n#%%\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n#%%\n\n\n\n#%%\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n#%%\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n#%%\n\nret\n\n#%%\n\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\nshow_img(img)\n\n#%%\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n#%%\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n#%%\n\nblended = 
cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n#%%\n\n\n\n#%%\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\ni = load_img()\nshow_img(i)\n\n#%%\n\ngamma = 1/4\n\n#%%\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n#%%\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n#%%\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n#%%\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n#%%\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n#%%\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n#%%\n\nimg = load_img()\nshow_img(img)\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n#%%\n\nimg = load_img()", "original_comment": "# 
creating white noise\n", "target_code": "white_noise = np.random.randint(0, 2, size=(600, 600))\n", "project_metadata": {"full_name": "RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning", "description": null, "topics": [], "git_url": "git://github.com/RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning.git", "stars": 3, "watchers": 3, "forks": 5, "created": "2019-05-28T02:31:41Z", "size": 48363, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 98148466, "Python": 466}, "last_updated": "2020-12-21T09:24:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "blank_img = np.zeros((600, 600))\nfont = cv2.FONT_HERSHEY_SIMPLEX\ncv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\nshow_img(blank_img)\nkernel = np.ones((5, 5), dtype=np.uint8)\nresult = cv2.dilate(img, kernel, iterations=1)\nshow_img(result)\nresult = cv2.dilate(img, kernel, iterations=4)\n", "model": "natural", "intent": "# creating white noise"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as 
word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n#%%\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):", "original_comment": " # make sure output directory exists\n", "target_code": " output_dir = pathlib.Path(output_dir)\n output_dir.mkdir(exist_ok=True)\n", "project_metadata": {"full_name": "thehackerwithin/illinois", "description": "THW Chapter at U. 
Illinois", "topics": [], "git_url": "git://github.com/thehackerwithin/illinois.git", "stars": 13, "watchers": 13, "forks": 31, "created": "2015-02-18T19:38:33Z", "size": 61361, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 868658, "TeX": 34143, "R": 18922, "HTML": 10291, "Julia": 5254, "Python": 4028, "C++": 425, "CMake": 94}, "last_updated": "2020-09-30T18:16:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "if not pathlib.exists(output_dir):\n os.makedirs(output_dir)\n", "model": "docstring", "intent": " # make sure output directory exists"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n\n# Check shape.\ncaravan_df_raw.shape\n\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n#%%\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n#%%\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n#%%\n\n# Check shape.\ncaravan_df_raw.shape\n\n#%%\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n#%%\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n#%%\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n#%%\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n#%%\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n#%%\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n#%%\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n#%%", "original_comment": "# Look globally at correlation of features.\n", "target_code": "corr = caravan_df_raw.corr()\n", "project_metadata": {"full_name": "jonrossi/caravan-insurance", "description": "Exploration and analysis of the Caravan Insurance dataset", "topics": [], "git_url": "git://github.com/jonrossi/caravan-insurance.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-09-23T17:40:57Z", "size": 951, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1260942}, "last_updated": "2020-10-31T21:58:03Z"}, "annotations": [{"completed_by": 
{"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "caravan_df_raw.corr()\n", "model": "natural", "intent": "# Look globally at correlation of features."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n\n', '.join([cat for cat in fs_categories])\n\n\ncinema = df_cinemas.loc[0]\n\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n#%%\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n#%%\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n#%%\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n#%%\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n#%%\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n#%%\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n#%%\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n#%%\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n#%%\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n#%%\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n#%%\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n#%%\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n#%%\n\n', '.join([cat for cat in fs_categories])\n\n#%%\n\ncinema = df_cinemas.loc[0]\n\n#%%\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n#%%\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n#%%\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n#%%\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n#%%\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n#%%\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation", "original_comment": "# Check the cinemas dataset contains any duplicated address\n", "target_code": "duplicated = df_cinemas.duplicated('Address', keep=False)\n", "project_metadata": {"full_name": "meghsat/CourseraIBMdatascience_course", "description": "In this repo consists of the projects I had done as part of the coursera's IBM data science Professional certificate.", "topics": [], "git_url": "git://github.com/meghsat/CourseraIBMdatascience_course.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-04-08T05:37:45Z", "size": 4855, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 14626378}, "last_updated": "2020-05-28T09:51:40Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "df_cinemas_nearby = venues_nearby(\n cinema['Latitude'], cinema['Longitude'], 'Cinema', verbose=True)\n", "model": "no-comments", "intent": "# Check the cinemas dataset contains any duplicated address"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 
'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = 
df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n 
pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n 
df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = 
sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. 
Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, 
geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n 
df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 
years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = 
df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n#%%\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n#%%\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n#%%\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n#%%\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = 
pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n#%%\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n#%%\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n#%%\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero 
for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed", "original_comment": "# Write to disk\n", "target_code": "df_Diffusion.to_csv(os.path.join(data_subdirectory, 'df_Diffusion.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_Diffusion.to_csv('data/df_diffusion.csv')\n", "model": "docstring", "intent": "# Write to disk"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic 
Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n\n# View one of the records\ndata.take(1)\n\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n\nlr_model = lr.fit(train)\n\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n#%%\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n#%%\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n#%%\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n#%%\n\n# View one of the records\ndata.take(1)\n\n#%%\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n#%%\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n#%%\n\nlr_model = lr.fit(train)\n\n#%%\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n#%%\n\n# Create the predictions\npredictions = 
lr_model.transform(test)\npredictions.show(5)\n\n#%%\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n#%%\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n#%%", "original_comment": "# Area under the ROC\n", "target_code": "print(\"Area Under ROC = %.2f\" % metrics.areaUnderROC)\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "lr_model.explainParams()\n", "model": "natural", "intent": "# Area under the ROC"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_moons, make_circles\nimport tensorflow as tf\nimport numpy as np\nimport math\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\n#%%", "original_comment": "# Set notebook to full width\n", "target_code": "from IPython.core.display import display, HTML\ndisplay(HTML(\"\"))\n", "project_metadata": {"full_name": "mmarius/nnia-tutorial", "description": "Repository for my tutorial group which is part of the lecture Neural Networks: Implementation and Application", "topics": [], "git_url": "git://github.com/mmarius/nnia-tutorial.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2017-11-02T15:20:51Z", "size": 12430, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1494110}, "last_updated": "2020-05-07T22:34:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", 
"coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "plt.rcParams['figure.figsize'] = (12, 8)\n", "model": "natural", "intent": "# Set notebook to full width"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')\n\n\n\ngender_amt = pd.DataFrame(fraud_df.head(100), columns=['amt', 'gender'])\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n#%%\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n#%%\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')", "original_comment": "# ### Boxplots of Amount by Gender\n", "target_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', x='gender', data=gender_amt, hue='gender',\n dodge=False, width=0.6, palette='Set2')\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1}], "predicted_code": "sns.boxplot(y='amt', data=fraud_df.tail(100), width=0.4, color='mediumpurple')\ngender_amt = pd.DataFrame(fraud_df.tail(100), columns=['amt', 'gender'])\n", "model": "docstring", "intent": "# Boxplots of Amount by Gender"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 
\uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data\n\n\n\ndef MinMaxScaler(data):\n ''' Min Max Normalization\n Parameters\n ----------\n data : numpy.ndarray\n input data to be normalized\n shape: [Batch size, dimension]\n Returns\n ----------\n data : numpy.ndarry\n normalized data\n shape: [Batch size, dimension]\n References\n ----------\n .. [1] http://sebastianraschka.com/Articles/2014_about_feature_scaling.html\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n#%%\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n#%%\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data", "original_comment": "# ### MinMax Scaling\n", "target_code": " import numpy as np\n\n numerator = data - np.min(data, 0)\n denominator = np.max(data, 0) - np.min(data, 0)\n return numerator / (denominator + 1e-7)\n", "project_metadata": {"full_name": "jwlee-ml/TensorFlow_Training_13th", "description": "Tensorflow\ub85c \uc2dc\uc791\ud558\ub294 \ub525\ub7ec\ub2dd Camp 13\uae30 \uc2e4\uc2b5", "topics": [], "git_url": "git://github.com/jwlee-ml/TensorFlow_Training_13th.git", "stars": 4, "watchers": 4, "forks": 5, "created": "2019-06-14T14:39:05Z", "size": 23519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23325250}, "last_updated": "2019-11-05T13:31:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "dataset = np.loadtxt('./data/pima-indians-diabetes.csv', delimiter=\",\")\nX = dataset[:, 0:8]\nY = dataset[:, 8]\nscaler = MinMaxScaler(X)\nX = scaler.fit_transform(X)\nX_train, X_test, Y_train, Y_test = train_test_split(\n X, Y, test_size=0.33, random_state=42)\nX_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)\nX_test = X_\n", "model": "no-comments", "intent": " # MinMax Scaling"}, 
{"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. 
It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n#%%\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n#%%\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n#%%\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n#%%\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n#%%\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. 
It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#", "original_comment": "# To compute the Frobenius norm:\n", "target_code": "A_frobenius = np.linalg.norm(A, 'fro')\n", "project_metadata": {"full_name": "garth-wells/notebooks-3M1", "description": "Jupyter notebooks (Python) for the course 3M1 at the Department of Engineering, University of Cambridge", "topics": ["linear-algebra", "singular-value-decomposition", "regression"], "git_url": "git://github.com/garth-wells/notebooks-3M1.git", "stars": 10, "watchers": 10, "forks": 18, "created": "2015-01-12T22:32:25Z", "size": 128315, "license": "bsd-2-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7472485}, "last_updated": "2021-01-04T10:34:46Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "A = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(np.linalg.norm(A, np.inf))\nprint(np.linalg.norm(A, 1))\nprint(np.linalg.norm(A, 2))\nprint(np.linalg.norm(A, np.inf))\n", "model": "no-comments", "intent": "# To compute the Frobenius norm:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_moons, make_circles\nimport tensorflow as tf\nimport numpy as np\nimport math\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\n#%%", "original_comment": "# Set notebook to full width\n", "target_code": "from IPython.core.display import display, HTML\ndisplay(HTML(\"\"))\n", "project_metadata": {"full_name": "mmarius/nnia-tutorial", "description": "Repository for my tutorial group which is part of the lecture Neural Networks: Implementation and Application", "topics": [], "git_url": "git://github.com/mmarius/nnia-tutorial.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2017-11-02T15:20:51Z", "size": 12430, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1494110}, "last_updated": "2020-05-07T22:34:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, 
"precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "plt.rc(\"font\", size=14)\n", "model": "docstring", "intent": "# Set notebook to full width"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n#%%\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n#%%\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## 
Step 04: Read from Database\n\n#%%\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n#%%\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n#%%\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n#%%", "original_comment": "# The standard deviation of daily offence count\n", "target_code": "std_count = crime_by_date.groupBy() .agg(stddev(\"Count\")\n ) .withColumnRenamed(\"stddev_samp(Count)\", \"Standard Deviation\")\n", "project_metadata": {"full_name": "WaicongTam/Assignment-Portfolio", "description": "This repository is showcase of the codes of my assignments. All the assignments I consider worth sharing will be updated here right after the late penalty has reached 50%.", "topics": [], "git_url": "git://github.com/WaicongTam/Assignment-Portfolio.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2019-06-01T03:27:31Z", "size": 10261, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1789685, "Java": 101530}, "last_updated": "2020-10-15T15:22:21Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "std_count = crime_by_date.groupBy() .std(\n \"Count\") .withColumnRenamed(\"std(Count)\", \"Standard Deviation\")\nstd_count.show()\n", "model": "natural", "intent": "# The standard deviation of daily offence count"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. 
**Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. 
**Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n#%%\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n#%%\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n#%%\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. 
Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n#%%\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n#%%", "original_comment": "# Check if there is any null information anywhere\n", "target_code": "train.isnull().any().any()\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(train.isnull().sum())\n", "model": "natural", "intent": "# Check if there is any null information anywhere"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. 
Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n 
ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML model\n# * Hyperparameter choices and 
justification\n#\n# A baseline model for solving this problem uses an in-series classifier to regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each and relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0 or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorial crosstenropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
\n#\n\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} 
output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. 
Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n#%%\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n#%%\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n#%%\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n#%%\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n#%%\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n#%%\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n#%%\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n 
ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n#%%\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n#%%\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML 
model\n# * Hyperparameter choices and justification\n#\n# A baseline model for solving this problem uses an in-series classifier to regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each and relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0 or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorial crosstenropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
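#%%

# A minimal, illustrative sketch of one such classifier/regressor pair in plain Keras,
# assuming a standard Sequential stack and the settings from the hyperparameter
# dictionaries defined in the next cell (4 hidden layers, 30 neurons, relu, Adam with
# lr=0.0001, L2 weight 1e-5). The notebook itself builds these networks with
# mlmicrophysics.models.DenseNeuralNetwork, whose internals may differ from this sketch.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam


def build_dense_net(n_inputs, n_outputs, output_activation, loss,
                    hidden_layers=4, hidden_neurons=30, lr=0.0001, l2_weight=1.0e-5):
    """Stack relu hidden layers and a task-specific output layer, mirroring the dicts below."""
    net = Sequential()
    net.add(Dense(hidden_neurons, activation="relu",
                  kernel_regularizer=l2(l2_weight), input_shape=(n_inputs,)))
    for _ in range(hidden_layers - 1):
        net.add(Dense(hidden_neurons, activation="relu",
                      kernel_regularizer=l2(l2_weight)))
    net.add(Dense(n_outputs, activation=output_activation))
    net.compile(optimizer=Adam(learning_rate=lr), loss=loss)
    return net


# classifier head: 3 classes (-1, 0, 1) for nrtend_TAU, 2 classes for the other outputs
example_classifier = build_dense_net(n_inputs=5, n_outputs=3,
                                     output_activation="softmax",
                                     loss="categorical_crossentropy")
# regressor head: a single scaled-tendency value
example_regressor = build_dense_net(n_inputs=5, n_outputs=1,
                                    output_activation="linear", loss="mse")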
\n#\n\n#%%\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n#%%\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n#%%\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n#%%\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n#%%\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n#%%\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed 
{output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n#%%\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n#%%\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n#%%\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n#%%\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n#%%\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n#%%\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n#%%", "original_comment": "# Histograms of test input data by column\n", "target_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_test[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "input_train.to_parquet(os.path.join(\n output_dir, 
\"microphysics_train.parquet\"), engine=\"pyarrow\")\noutput_train.to_parquet(os.path.join(\n output_dir, \"microphysics_test.parquet\"), engine=\"pyarrow\")\nmeta_train.to_parquet(os.path.join(\n output_dir, \"microphysics_meta.parquet\"), engine=\"pyarrow\")\n", "model": "no-comments", "intent": "# Histograms of test input data by column"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]\n\n\n\ndef print_hw_parameters(model):\n alpha, beta, gamma = model.params['smoothing_level'], model.params[\n 'smoothing_slope'], model.params['smoothing_seasonal']\n print(\"Holt-Winters parameters:\")\n print(\"Alpha: \", alpha)\n print(\"Beta: \", beta)\n print(\"Gamma: \", gamma)\n\n\nprint(\"Forecasting started...\")\nstart_time = time.time()\n\ntry:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n#%%\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n#%%\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n#%%\n\nts = 
ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n#%%\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n#%%\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n#%%\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]", "original_comment": "# ### Fit and predict Time Serie\n", "target_code": " model = hw.ExponentialSmoothing(\n ts_train, seasonal='additive', seasonal_periods=train_data_length-1).fit()\n predictions = model.predict(start=ts_test.index[0], end=ts_test.index[-1])\n", "project_metadata": {"full_name": "CSIRT-MU/QoSForecastLSTM", "description": "An evaluation of QoS forecast methods described in paper Quality of Service Forecasting with LSTM Neural Networks", "topics": ["publication"], "git_url": "git://github.com/CSIRT-MU/QoSForecastLSTM.git", "stars": 4, "watchers": 4, "forks": 2, "created": "2018-09-05T07:37:36Z", "size": 10237, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16021131}, "last_updated": "2020-03-27T12:49:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model.fit(ts_train)\nend_time = time.time()\nprint(\"Training time: \", end_time - start_time)\n", "model": "docstring", "intent": " # Fit and predict Time Serie"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas 
as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n#%%\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')", "original_comment": "# normalize the format of DATE\n", "target_code": "train_target['Date'] = pd.to_datetime(train_target['Date'])\n", "project_metadata": {"full_name": "Quan-Sun/TADPOLE-ECE5970", "description": "machine learning with biomedical data", "topics": [], "git_url": "git://github.com/Quan-Sun/TADPOLE-ECE5970.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-11-16T21:39:24Z", "size": 15650, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5564392}, "last_updated": "2019-04-19T22:32:32Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train_target['DATE'] = pd.to_datetime(\n train_target['DATE'], format='%Y-%m-%d %H:%M:%S')\n", "model": "natural", "intent": "# normalize the format of DATE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) 
were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. 
If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. However, it is important that the corpus is representative for this specific topic.\n\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. 
It helps to ignore your search keywords.\n\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. This is important so that the results are not distorted by over- or under-representation of a category.\n\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. 
Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n\ncorpus = train_df.bow\n\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the test corpus \n# First, have a 
look at your test corpus\n\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n\nresult = np.append(df_all.text.values.reshape(-1, 1),\n 
submission_predictions, axis=1)\n\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. 
This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. 
To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n#%%\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n#%%\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n#%%\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. 
However, it is important that the corpus is representative for this specific topic.\n\n#%%\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. It helps to ignore your search keywords.\n\n#%%\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n#%%\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n#%%\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n#%%\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n#%%\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. 
This is important so that the results are not distorted by over- or under-representation of a category.\n\n#%%\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n#%%\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n#%%\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n#%%\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n#%%\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n#%%\n\ncorpus = train_df.bow\n\n#%%\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n#%%\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n#%%\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n#%%\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n#%%\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n#%%\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n#%%\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n#%%\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the 
test corpus \n# First, have a look at your test corpus\n\n#%%\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n#%%\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n#%%\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n#%%\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n#%%\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n#%%\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n#%%\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n#%%\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n#%%\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n#%%\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n    else:\n        statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n                 appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for further processing.\n#\n# Note: If you are mainly interested in one of the categories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, an overall good score is more important.\n\n#%%\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n                     'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n                       'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n#%%\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n#%%\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n#%%\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short articles, the dataframe now has:\",\n      len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short articles, the dataframe now has:\",\n      len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectors for your whole collection\n\n#%%\n\ndef document_to_bow(df):\n    df['bow'] = list(\n        map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n#%%\n\ndf_all['lda_features'] = list(map(lambda doc:\n                                  document_to_lda_features(lda, doc),\n                                  df_all.bow))\n\n#%%\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n#%%\n\nresult = 
np.append(df_all.text.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n#%%\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n#%%\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n#%%\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n#%%\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n#%%\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n#%%\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n#%%\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])", "original_comment": "# Transform the dates from strings to integers\n", "target_code": "for i in range(0, len(date_ads)):\n date_ads[i] = int(date_ads[i])\n", "project_metadata": {"full_name": "NewsEye/NLP-Notebooks-Newspaper-Collections", "description": "A collection of notebooks for Natural Language Processing", "topics": ["lda", "topic-modeling", "shannon", "nlp-notebooks", "digital-humanities", "newspaper-collections", "newspaper-clippings", "text-classification", "similarity"], "git_url": "git://github.com/NewsEye/NLP-Notebooks-Newspaper-Collections.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-07-06T11:18:13Z", "size": 12866, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4306857}, "last_updated": "2020-12-01T08:54:40Z"}, "annotations": [{"completed_by": {"id": 
1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "date_ads = pd.to_datetime(date_ads)\ndate_ads = pd.DataFrame(date_ads, columns=['date'])\ndate_ads.head()\n", "model": "natural", "intent": "# Transform the dates from strings to integers"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n 
ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = 
xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n 
cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. 
Starting with Gene Set 2:\n\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. 
These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. 
I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. 
Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, 
posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n\ncr3.sort_values('index', inplace=True)\n\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n#%%\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n#%%\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n#%%\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n#%%\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), 
columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n#%%\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n#%%\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n#%%\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n#%%\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n#%%\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n#%%\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n#%%\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 
3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n#%%\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n#%%\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n#%%\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get 
predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n#%%\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n#%%\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n#%%\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n#%%\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n#%%\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. 
Starting with Gene Set 2:\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n#%%\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. 
These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n#%%\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n#%%\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n#%%\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n#%%\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. 
I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n#%%\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. 
Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n#%%\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n#%%\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n#%%\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n#%%\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n#%%\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n#%%\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n#%%\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# 
print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n#%%\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n#%%\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n#%%\n\ncr3.sort_values('index', inplace=True)\n\n#%%\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n#%%\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)", "original_comment": "# Cluster and plot\n", "target_code": "g = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='Reds')\nplt.show()\n", "project_metadata": {"full_name": "illdopejake/Hippocampus_AP_Axis", "description": "Code used for Hippocampus Anterior/Posterior gene expression and neuroimaging analyses ", "topics": [], "git_url": "git://github.com/illdopejake/Hippocampus_AP_Axis.git", "stars": 7, "watchers": 7, "forks": 1, "created": "2018-05-20T18:18:47Z", "size": 149297, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 20340748, "Python": 58444, "Shell": 2454}, "last_updated": "2020-12-20T09:17:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='index', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.\n", "model": "no-comments", "intent": "# Cluster and plot"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. 
Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. Let's look at a portion of the table\n\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = 
str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n plt.colorbar()\n tick_marks = 
np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n#%%\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. 
Let's look at a portion of the table\n\n#%%\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n#%%\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n#%%\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n#%%\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n#%%\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). 
This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n#%%\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n#%%\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n#%%\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n#%%\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n#%%\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n#%%\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n#%%\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n#%%\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n#%%\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n#%%\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n#%%\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n#%%\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n#%%\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n#%%\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n 
plt.colorbar()\n tick_marks = np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n#%%\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))", "original_comment": "# Add output layer\n", "target_code": "M1.add(Dense(1, activation='sigmoid'))\n", "project_metadata": {"full_name": "insight-decentralized-consensus-lab/TrustKeeper", "description": "A fraud prevention system for Peer-to-Peer transaction networks (Jahir M Gutierrez)", "topics": [], "git_url": "git://github.com/insight-decentralized-consensus-lab/TrustKeeper.git", "stars": 8, "watchers": 8, "forks": 7, "created": "2018-09-28T20:15:21Z", "size": 10845, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 614132, "Scala": 19178, "Python": 14102}, "last_updated": "2020-03-18T22:55:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "M1.compile(loss='categorical_crossentropy',\n optimizer='adam', metrics=['accuracy'])\n", "model": "no-comments", "intent": "# Add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. 
Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. Let's look at a portion of the table\n\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = 
str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n plt.colorbar()\n tick_marks = 
np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n#%%\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. 
Let's look at a portion of the table\n\n#%%\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n#%%\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n#%%\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n#%%\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n#%%\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). 
This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n#%%\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n#%%\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n#%%\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n#%%\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n#%%\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n#%%\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n#%%\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n#%%\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n#%%\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n#%%\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n#%%\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n#%%\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n#%%\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n#%%\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n 
plt.colorbar()\n tick_marks = np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n#%%\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))", "original_comment": "# Add output layer\n", "target_code": "M1.add(Dense(1, activation='sigmoid'))\n", "project_metadata": {"full_name": "insight-decentralized-consensus-lab/TrustKeeper", "description": "A fraud prevention system for Peer-to-Peer transaction networks (Jahir M Gutierrez)", "topics": [], "git_url": "git://github.com/insight-decentralized-consensus-lab/TrustKeeper.git", "stars": 8, "watchers": 8, "forks": 7, "created": "2018-09-28T20:15:21Z", "size": 10845, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 614132, "Scala": 19178, "Python": 14102}, "last_updated": "2020-03-18T22:55:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "M1.add(Dense(1, activation='sigmoid'))\n", "model": "natural", "intent": "# Add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's 
say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n#%%\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport time\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import 
InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n#%%\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n#%%\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n#%%\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n#%%\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = 
create_model(num_classes=len(labelencoder.classes_))", "original_comment": "# So we can look at the progress on Tensorboard\n", "target_code": "import time\n\ncallback = keras.callbacks.TensorBoard(\n log_dir='./logs/inception/2/{}'.format(time.time())\n)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2, "coverage": "Disagree", "coverage-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "keras.callbacks.tensorboard.set_model(inception)\n", "model": "docstring", "intent": "# Enable callback to be able to look at the progress on Tensorboard"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n 
optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n\nout\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n\ntrain, validation = get_MNIST_data()\n\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 
1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch 
size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n#%%\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n#%%\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n#%%\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n#%%\n\nout\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n#%%\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = 
rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n#%%\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n#%%\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n#%%", "original_comment": "# Shifted dataset\n", "target_code": "train_20, validation_20 = get_MNIST_data(shift=20)\ntrain_20 = rescale(train_20)\nvalidation_20 = rescale(validation_20)\n", "project_metadata": {"full_name": "elahea2020/6.036", "description": "Homework solutions of Intro to ML course at MIT Spring 2018", "topics": ["ml", "machine-learning", "machine-learning-algorithms", "mit", "6036", "perceptron-learning-algorithm", "rnn"], "git_url": "git://github.com/elahea2020/6.036.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2018-05-08T21:21:54Z", "size": 65530, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 18939819, "Python": 168769}, "last_updated": "2020-10-25T08:09:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "def shift(data):\n X, y = data\n X = X/255.\n return (X, y)\ntrain = shift(train)\nvalidation = shift(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\narch =\n", "model": "natural", "intent": "# Shifted dataset"}, {"context": "#!/usr/bin/env python\n# coding: 
utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)\nplt.legend()\nplt.savefig('lumpsum.png')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on 
Python 3\n\n#%%\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n#%%\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n#%%\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n#%%\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n#%%\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)", "original_comment": "# Show with Legend\n", "target_code": "plt.show()\n", "project_metadata": {"full_name": "eonofrey/DollarCostAverage_vs._LumpSum", "description": "Comparing dollar cost averaging vs. 
lump sum investment in the SPY ", "topics": [], "git_url": "git://github.com/eonofrey/DollarCostAverage_vs._LumpSum.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-06-19T21:58:51Z", "size": 1525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 587938}, "last_updated": "2020-12-19T01:53:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.legend()\nplt.show()\n", "model": "docstring", "intent": "# Show with Legend"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. 
Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n\ndata['sex'].unique()\n\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. 
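# Supplementary sketch (illustrative, not code from the original WoE notebook):
# the demo above ends by computing the per-category event rate with
# X_train.groupby(['cabin'])['survived'].mean(). A minimal way to turn such
# counts into the Weight of Evidence ln(P(Goods)/P(Bads)) described above is
# sketched here; `woe_per_category` and the toy frame are assumptions made up
# for illustration, and a 0/1 target like the demo's 'survived' is assumed.
import numpy as np
import pandas as pd


def woe_per_category(df, cat_col, target_col, eps=1e-6):
    # contingency table: rows = categories, columns = target values (0 = bad, 1 = good)
    counts = pd.crosstab(df[cat_col], df[target_col])
    p_bad = counts[0] / counts[0].sum()    # P(category | bad)
    p_good = counts[1] / counts[1].sum()   # P(category | good)
    # eps guards against log(0) for categories seen on only one side
    return np.log((p_good + eps) / (p_bad + eps))


# toy usage with a 0/1 target, mirroring the Titanic-style demo above
toy = pd.DataFrame({'cabin': list('AABBC'), 'survived': [1, 0, 1, 1, 0]})
print(woe_per_category(toy, 'cabin', 'survived'))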
Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n#%%\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n#%%\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n#%%\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n#%%\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n#%%\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n#%%\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n#%%\n\ndata['sex'].unique()\n\n#%%\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n#%%\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n#%%\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n 
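    # mean survival per category of the current variable, i.e. the empirical
    # event rate that the ratio/WoE encodings discussed above are built from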
fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n#%%\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()", "original_comment": "# and capture it into a dataframe\n", "target_code": "prob_df = pd.DataFrame(prob_df)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "for var in ['cabin','sex', 'embarked']:\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n", "model": "no-comments", "intent": "# capture it into a dataframe"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = 
encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn import preprocessing\nfrom sklearn.feature_selection import RFE\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.preprocessing import scale\nfrom sklearn.metrics import roc_auc_score\nimport time\n\n#%%\n\ndef encode(data, col, max_val):\n data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)\n data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)\n return data\n\n#%%\n\n# Read in data from small csv to a dataframe\ndf1 = pd.read_csv('weather_large.csv', sep=',')\n\n# Reformat data in date/time column\ndf1['Formatted Date'] = pd.to_datetime(df1['Formatted Date'])\n\n# Create a new column for year / month / hour\ndf1['Year'] = pd.DatetimeIndex(df1['Formatted Date']).year\ndf1['Month'] = pd.DatetimeIndex(df1['Formatted Date']).month\ndf1['Hour'] = pd.DatetimeIndex(df1['Formatted Date']).hour\n\n# Encode month and hour for cyclical nature\ndf1 = encode(df1, 'Month', 13)\ndf1 = encode(df1, 'Hour', 23)\ndf1 = encode(df1, 'Wind Bearing (degrees)', 359)\n\n# Remove original date/time column\ndf1 = df1.drop(['Formatted Date'], axis=1)\n\n# Convert columns to factors\ndf1['Summary'] = df1['Summary'].astype('category')\ndf1['Precip Type'] = df1['Precip Type'].astype('category')\ndf1['Daily Summary'] = df1['Daily Summary'].astype('category')\n\n# Create a column stating whether its mostly cloudy / overcast or not in summary\ndf1['Heavy_Cloud'] = pd.np.where(df1.Summary.str.contains(\"Mostly Cloudy\"), 1,\n 
pd.np.where(df1.Summary.str.contains(\"Overcast\"), 1,\n pd.np.where(df1.Summary.str.contains(\"Foggy\"), 1, 0)))\n\n# Convert to boolean and print count\ndf1['Heavy_Cloud'] = df1['Heavy_Cloud'].astype('bool')\n\n#%%\n\n# Create new value for X based on strongest variables\nX = scale(df1[['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n 'Month_cos', 'Visibility (km)']])\nX = pd.DataFrame(data=X)\ny = df1[\"Heavy_Cloud\"]\n\n#%%\n\n# Create training, validation and test data\n# Create Training&Validation / Test set - split of 70/20/10\nX_intermediate, X_test, y_intermediate, y_test = train_test_split(\n X, y, test_size=0.1)\nX_valid, X_train, y_valid, y_train = train_test_split(X_intermediate, y_intermediate,\n test_size=0.78)\n# delete intermediate variables\nX_intermediate, y_intermediate\n\nprint('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(df1), 2),\n round(\n len(y_valid)/len(df1), 2),\n round(len(y_test)/len(df1), 2)))\n\n#%%\n\nstart_time = time.clock()\n# Fit a logistic regression model to the training data\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\n\n# Print coefficients\nprint('Coefficeints', lrm.coef_)", "original_comment": "# Reshape validation data so that model can be run\n", "target_code": "X_valid = X_valid.values.reshape(-1, 6)\n", "project_metadata": {"full_name": "abhishek3aj/ML1819--task-101--team-06", "description": "ML framework comparison", "topics": [], "git_url": "git://github.com/abhishek3aj/ML1819--task-101--team-06.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-10-09T09:48:20Z", "size": 21107, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4638466, "Python": 84406}, "last_updated": "2018-12-17T19:27:23Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "start_time = time.clock()\nlrm = LogisticRegression()\nmodel = lrm.fit(X_train, y_train)\nprint(time.clock() - start_time, \"seconds\")\nprint('Coefficeints', lrm.coef_)\n", "model": "no-comments", "intent": "# Reshape validation data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], 
pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n#%%\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n#%%\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n#%%\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n#%%\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', 
kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()", "original_comment": "# ### 1.5 Distribution of rgb channels of negative examples\n", "target_code": "for n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n", "project_metadata": {"full_name": "itratrahman/covid_19", "description": "This project contains AI and Data Science projects that analyses disease classification from images, forecasting, and EDA report of the pandemic.", "topics": [], "git_url": "git://github.com/itratrahman/covid_19.git", "stars": 5, "watchers": 5, "forks": 0, "created": "2020-03-22T03:36:28Z", "size": 26502, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6190010}, "last_updated": "2020-04-28T07:40:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.figure(figsize=(20, 20))\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, n+1)\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n", "model": "natural", "intent": "# 1.5 Distribution of rgb channels of negative examples"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 
'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! >\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you 
want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n#%%\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n#%%\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n#%%\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n#%%\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! 
>\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n#%%\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n#%%\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n#%%\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n#%%\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n#%%\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. 
using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n#%%\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)", "original_comment": "# ### Put them all together in multiplots\n", "target_code": "pzb.plot_multi(names=['fof', 'zz', 'pdf', 'pit'], verbose=1, save_plot=1)\n", "project_metadata": {"full_name": "LSSTDESC/pz_blend", "description": "impact of blending on photo-zs using DC2 truth catalogs and image catalogs", "topics": [], "git_url": "git://github.com/LSSTDESC/pz_blend.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-03-12T22:06:14Z", "size": 2183, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1709826, "Python": 89195}, "last_updated": "2020-12-09T18:50:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "pzb.plot_pdf(kde_bandwidth='scott')\n", "model": "no-comments", "intent": "# Put them all together in multiplots"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data 
files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. 
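# Supplementary sketch (illustrative, not from the DataCamp notebook): the
# inspection and slicing calls discussed above, run on a tiny self-contained
# DataFrame; the column names and values here are made up for illustration.
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'Year': [1960, 1970, 1980, 1990, 2000],
                       'Population': [1.0, np.nan, 3.0, 4.0, 5.0]})

print(toy_df.head(2))      # first two rows
print(toy_df.tail(2))      # last two rows
print(toy_df.iloc[:3, :])  # positional slicing: first three rows, all columns
toy_df.info()              # dtypes and non-null counts (one Population value is missing)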
In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
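# Supplementary sketch (illustrative, not from the DataCamp notebook): .info()
# reports non-null counts per column, so missing entries can be inferred as
# len(df) - non-null count; .isna().sum() gives the same number directly.
# The toy frame below is an assumption made up for illustration.
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'Total Population': [1.0, np.nan, 3.0, np.nan],
                       'Urban population (% of total)': [10.0, 20.0, 30.0, 40.0]})

toy_df.info()                        # shows 2 non-null of 4 rows for the first column
print(toy_df.isna().sum())           # missing entries per column: 2 and 0
print(len(toy_df) - toy_df.count())  # equivalent, derived from the non-null counts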
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
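#
# The zip-to-dict-to-DataFrame pattern, sketched with placeholder values (the real medal data appears below):
#
# ```python
# import pandas as pd
#
# keys = ['Country', 'Total']          # placeholder keys and values
# values = [['A', 'B'], [10, 20]]
# zipped = list(zip(keys, values))     # [('Country', ['A', 'B']), ('Total', [10, 20])]
# pd.DataFrame(dict(zipped))
# ```
#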
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
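#
# The broadcasting mechanic at work here, sketched on an assumed two-city frame: a scalar paired with a list is repeated for every row.
#
# ```python
# import pandas as pd
#
# demo = pd.DataFrame({'city': ['Easton', 'Altoona'],   # toy two-city list
#                      'state': 'PA'})                  # scalar, repeated for every row
# demo['country'] = 'US'                                # same broadcasting on an existing frame
# demo
# ```
#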
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
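#
# The read-time cleanup options can be rehearsed on an assumed in-memory stand-in before touching the real file (a sketch, not the course's messy_stock_data.tsv):
#
# ```python
# import pandas as pd
# from io import StringIO
#
# messy = StringIO("# a comment record\n"
#                  "name price\n"
#                  "AAPL 100\n"
#                  "IBM 150\n")
#
# pd.read_csv(messy, delimiter=' ', comment='#')   # comment rows dropped, spaces as separators
# ```
#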
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
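#
# One detail from the plotting cells above is worth making explicit; sketched on synthetic data, ```plt.plot(series)``` and ```series.plot()``` draw the same values, but the pandas method uses the index for the x-axis.
#
# ```python
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
#
# idx = pd.date_range('2000-01-01', periods=100, freq='D')   # synthetic daily index
# s = pd.Series(np.random.randn(100).cumsum(), index=idx)
#
# plt.plot(s)   # matplotlib call: values against 0..99
# s.plot()      # pandas method: values against the DatetimeIndex
# plt.show()
# ```
#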
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
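#
# A minimal sketch of the x/y selection this exercise practices, on a toy three-month frame (rounded stand-in values):
#
# ```python
# import pandas as pd
# import matplotlib.pyplot as plt
#
# demo = pd.DataFrame({'Month': [1, 2, 3],               # month numbers as a stand-in
#                      'AAPL': [117.2, 128.5, 124.4],
#                      'IBM': [153.3, 161.9, 160.5]})
#
# demo.plot(x='Month', y=['AAPL', 'IBM'])   # one line per column named in the y list
# plt.show()
# ```
#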
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cd\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative density functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entires\n# * mean: average of entries\n# * std: standard deviation\n# * min: miniumum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n\niris['sepal length (cm)'].count() # Applied to Series\n\n\niris['sepal width (cm)'].count() # Applied to Series\n\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency to a central value of a measurement\n\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical value that splits the data into two sets\n# * one with the 
fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n\niris.min()\n\n\niris.max()\n\n\n# #### Box Plots\n\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf.fare.describe()\n\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
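#
# The mean-versus-spread point can be previewed on two invented five-value series before touching the weather data:
#
# ```python
# import pandas as pd
#
# a = pd.Series([30, 31, 29, 30, 30])   # small fluctuation
# b = pd.Series([10, 50, 30, 45, 15])   # same mean, much larger spread
#
# print(a.mean(), b.mean())   # both 30.0
# print(a.std(), b.std())     # roughly 0.7 versus roughly 17.7
# ```
#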
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n\n# Print the mean of the January and March data\njanuary.mean()\n\n\nmarch.mean()\n\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n\nsetosa['species'].unique()\n\n\nversicolor['species'].unique()\n\n\nvirginica['species'].unique()\n\n\nsetosa.head(2)\n\n\nversicolor.head(2)\n\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All Data\n\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n 
alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n\ndescribe_all = iris.describe()\ndescribe_all\n\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
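#
# As a small preview of the method chaining described here, on a synthetic hourly series (names assumed):
#
# ```python
# import numpy as np
# import pandas as pd
#
# idx = pd.date_range('2015-02-01', periods=96, freq='H')   # four days of synthetic hourly stamps
# ts = pd.Series(np.random.randn(96), index=idx)
#
# # filter by date strings, then downsample to daily means, in one chain
# ts.loc['2015-02-02':'2015-02-03'].resample('D').mean()
# ```
#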
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n\nsales.reindex(evening_2_11, method='ffill')\n\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n\ndf1.head()\n\n\ndf2.head()\n\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
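#
# A compact sketch of the to_datetime-plus-format pattern the exercise asks for, using two made-up strings:
#
# ```python
# import pandas as pd
#
# stamps = ['2015-01-01 091234', '2015-01-02 103045']   # made-up strings in the format above
# pd.to_datetime(stamps, format='%Y-%m-%d %H%M%S')      # -> DatetimeIndex of parsed timestamps
# ```
#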
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
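#
# For instance, sketched on a two-point series and a three-date target index:
#
# ```python
# import pandas as pd
#
# old = pd.Series([1.0, 2.0], index=pd.to_datetime(['2016-07-01', '2016-07-03']))
# new_idx = pd.to_datetime(['2016-07-01', '2016-07-02', '2016-07-03'])
#
# old.reindex(new_idx)                   # the unmatched date becomes NaN
# old.reindex(new_idx, method='ffill')   # or is forward-filled when requested
# ```
#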
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n\ndaily_mean.loc['2015-2-2']\n\n\nsales.loc['2015-2-2', 'Units']\n\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n\nsales.resample('D').sum().head()\n\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter |\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n\nsales.loc[:, 
'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result to ***df2***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. Store the result in ***february_lows***.\n\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n\n# Extract temperature data for February: february\nfebruary = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# #### Rolling mean and frequency\n#\n# In this exercise, some hourly weather data is pre-loaded for you. 
You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. 
Assign the result to ***daily_highs_smoothed*** and print the result.\n\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
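For instance, a minimal sketch of the index-based workflow (a hypothetical hourly index, not the flight data) might look like:\n#\n# ```python\n# import pandas as pd\n#\n# idx = pd.date_range('2015-07-01 09:00', periods=3, freq='H')  # tz-naive index\n# s = pd.Series([1, 2, 3], index=idx)\n# s_central = s.tz_localize('US/Central')  # attach a timezone to the index\n# s_pacific = s_central.tz_convert('US/Pacific')  # convert to another zone\n# ```\n#\n# 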
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes not only in your data, but also in your plotting.\n#\n# In this 
exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
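The general pattern (a hedged sketch with made-up values, not the NOAA file) is to combine the date and time pieces into one string per row and parse that string with an explicit format:\n#\n# ```python\n# import pandas as pd\n#\n# raw = pd.DataFrame({'date': ['20110101', '20110101'], 'Time': ['53', '2153']})\n# padded = raw['Time'].apply(lambda x: '{:0>4}'.format(x))  # '53' -> '0053'\n# stamps = pd.to_datetime(raw['date'] + padded, format='%Y%m%d%H%M')\n# ```\n#\n# 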
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. 
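A tiny hypothetical illustration of why that matters: subtracting two Series whose DatetimeIndexes do not overlap aligns on the index and yields nothing but ```NaN```, whereas the underlying NumPy arrays subtract position by position.\n#\n# ```python\n# import pandas as pd\n#\n# a = pd.Series([70.0], index=pd.to_datetime(['2011-01-01']))\n# b = pd.Series([65.0], index=pd.to_datetime(['2010-01-01']))\n# a - b  # aligned on the index -> all NaN\n# a.values - b.values  # plain arrays ignore the index -> array([5.])\n# ```\n#\n# 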
This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\ndf_clean.head(3)\n\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. 
Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. 
You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
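As a hedged aside with toy numbers (not the population data): NumPy ufuncs such as ```np.log10()``` accept both plain arrays and pandas objects, and they return the same kind of container they were given.\n#\n# ```python\n# import numpy as np\n# import pandas as pd\n#\n# toy = pd.DataFrame({'pop': [1e3, 1e5]})\n# np.log10(toy.values)  # ndarray in, ndarray out\n# np.log10(toy)  # DataFrame in, DataFrame out (same values)\n# ```\n#\n# 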
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
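As a minimal sketch (two made-up rows), a scalar value in the dictionary is broadcast down the entire column:\n#\n# ```python\n# import pandas as pd\n#\n# pd.DataFrame({'state': 'PA', 'city': ['Manheim', 'Indiana']})  # 'PA' repeats on every row\n# ```\n#\n# 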
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
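# Before moving on, here is a small, self-contained recap of the column-selection and ```subplots=True``` idioms used in the exercise above, on a made-up DataFrame (the column names and values below are illustrative only, not the Austin weather data).

#%%

# Toy DataFrame with one column on a very different scale, mimicking the
# Temperature/DewPoint/Pressure situation above (values are invented)
import matplotlib.pyplot as plt
import pandas as pd

toy = pd.DataFrame({'Temperature': [85.2, 84.9, 86.1, 87.0, 85.5],
                    'DewPoint': [66.0, 65.4, 67.2, 66.8, 66.1],
                    'Pressure': [1.0, 1.0, 1.0, 1.0, 1.0]})

toy.plot()                                # all columns on one axes: scaling problem
toy.plot(subplots=True)                   # one independently scaled subplot per column
toy[['Temperature', 'DewPoint']].plot()   # restrict to comparable columns
plt.show()
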
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalize to one - formerly this was **normed**\n# * **cumulative** (boolean): compute the Cumulative Distribution Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plot.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: check the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df.
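# The histogram options listed above (```bins```, ```range```, ```density```, ```cumulative```) can be tried on any numeric column; the sketch below uses randomly generated toy values rather than the iris data, assuming ```matplotlib``` is available as in the rest of this notebook.

#%%

# Toy numeric column (random values) to exercise the histogram keywords
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame({'value': rng.normal(loc=6.0, scale=0.8, size=200)})

# PDF-style histogram: density=True normalizes the bar areas to sum to one
toy.plot(y='value', kind='hist', bins=30, range=(4, 8), density=True)

# CDF: add cumulative=True on top of density=True
toy.plot(y='value', kind='hist', bins=30, range=(4, 8),
         density=True, cumulative=True)
plt.show()
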
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n#%%\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cdf\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative distribution functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```density=True``` in your call to ```.hist()``` (older pandas versions used ```normed=True```), and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```density=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entries\n# * mean: average of entries\n# * std: standard deviation\n# * min: minimum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n#%%\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n#%%\n\niris['sepal length (cm)'].count() # Applied to Series\n\n#%%\n\niris['sepal width (cm)'].count() # Applied to Series\n\n#%%\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n#%%\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency of a measurement toward a central value\n\n#%%\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n#%%\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures the spread of a measurement\n\n#%%\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n#%%\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n#%%\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical
value that splits the data into two sets\n# * one with the fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n#%%\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n#%%\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n#%%\n\niris.min()\n\n#%%\n\niris.max()\n\n\n# #### Box Plots\n\n#%%\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n#%%\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n#%%\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n#%%\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n#%%\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf.fare.describe()\n\n#%%\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n#%%\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n#%%\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
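# A quick toy illustration of why comparing both the mean and the standard deviation matters: two samples can share the same mean while having very different spreads. The numbers below are made up purely for illustration.

#%%

import pandas as pd

calm = pd.Series([29, 30, 31, 30, 29, 31])     # small day-to-day spread
swingy = pd.Series([10, 50, 28, 45, 12, 35])   # same mean, large spread

print(calm.mean(), swingy.mean())   # both means are 30
print(calm.std(), swingy.std())     # the standard deviations differ sharply
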
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n#%%\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n#%%\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n#%%\n\n# Print the mean of the January and March data\njanuary.mean()\n\n#%%\n\nmarch.mean()\n\n#%%\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n#%%\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n#%%\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n#%%\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n#%%\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n#%%\n\nsetosa['species'].unique()\n\n#%%\n\nversicolor['species'].unique()\n\n#%%\n\nvirginica['species'].unique()\n\n#%%\n\nsetosa.head(2)\n\n#%%\n\nversicolor.head(2)\n\n#%%\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All 
Data\n\n#%%\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n#%%\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n#%%\n\ndescribe_all = iris.describe()\ndescribe_all\n\n#%%\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n#%%\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n#%%\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n#%%\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n#%%\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n#%%\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n#%%\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n#%%\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n#%%\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n#%%\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n#%%\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
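# The boolean-filter-then-chain pattern used above generalizes well; here is a minimal sketch on a made-up DataFrame (toy ```pclass```/```fare``` values, not the Titanic file) that filters one sub-population and summarizes it in a single chained expression.

#%%

import pandas as pd

toy = pd.DataFrame({'pclass': [1, 1, 2, 2, 3, 3, 3],
                    'fare': [80.0, 120.0, 26.0, 30.0, 7.5, 8.0, 9.5]})

# Boolean filter, column selection and a summary in one chained expression
print(toy.loc[toy['pclass'] == 1, 'fare'].describe())
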
From stock prices to flight timings, time series data are found in a wide variety of domains, and being able to work effectively with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n#%%\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n#%%\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n#%%\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n#%%\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n#%%\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n#%%\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n#%%\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n#%%\n\nsales.reindex(evening_2_11, method='ffill')\n\n#%%\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf2.head()\n\n#%%\n\ndf3.head()\n\n\n# ***datetime slicing is allowed when the index is a datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n#%%\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S'```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings have been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively.
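# The partial-string selection and reindexing ideas above can be reproduced on a tiny hand-built Series; the timestamps and values below are made up for illustration.

#%%

import pandas as pd

idx = pd.to_datetime(['2015-02-04 09:00', '2015-02-04 15:00',
                      '2015-02-05 10:00', '2015-02-06 12:00'])
units = pd.Series([3, 5, 7, 2], index=idx)

print(units.loc['2015-02-04'])               # whole day via a partial date string
print(units.loc['2015-02-04':'2015-02-05'])  # slicing with date strings

# Reindex onto new timestamps, forward-filling from the last earlier value
new_times = pd.to_datetime(['2015-02-04 12:00', '2015-02-05 12:00'])
print(units.reindex(new_times, method='ffill'))
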
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n#%%\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n#%%\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n#%%\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n#%%\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n#%%\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n#%%\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n#%%\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n#%%\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n#%%\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n#%%\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
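# Before the reindexing exercise below, here is a compact sketch of the behaviour being described: unmatched dates become ```NaN``` unless a fill method is supplied, and those ```NaN``` values propagate through addition. The dates and values are made up.

#%%

import pandas as pd

weekdays = pd.Series([1, 2, 3],
                     index=pd.to_datetime(['2016-07-01', '2016-07-04', '2016-07-05']))
all_days = pd.to_datetime(['2016-07-01', '2016-07-02', '2016-07-03',
                           '2016-07-04', '2016-07-05'])

print(weekdays.reindex(all_days))                  # unmatched dates become NaN
print(weekdays.reindex(all_days, method='ffill'))  # or forward-fill them instead

base = pd.Series(10, index=all_days)
print(base + weekdays.reindex(all_days))           # NaN + number stays NaN
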
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n#%%\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n#%%\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n#%%\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n#%%\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n#%%\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n#%%\n\ndaily_mean.loc['2015-2-2']\n\n#%%\n\nsales.loc['2015-2-2', 'Units']\n\n#%%\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n#%%\n\nsales.resample('D').sum().head()\n\n#%%\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n#%%\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n#%%\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter 
|\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n#%%\n\nsales.loc[:, 'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n#%%\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n#%%\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result ***df2***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n#%%\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. 
Store the result in ***february_lows***.\n\n#%%\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n#%%\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n#%%\n\n# Extract temperature data for February: february\nfebruary = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n#%%\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# #### Rolling mean and frequency\n#\n# In this exercise, some hourly weather data is pre-loaded for you. You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n#%%\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n#%%\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n#%%\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n#%%\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you.
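# The resample-then-roll chaining described above can be sketched on a made-up hourly series (random toy temperatures, not the pre-loaded weather data):

#%%

import numpy as np
import pandas as pd

hours = pd.date_range('2010-08-01', periods=24 * 14, freq='H')   # two toy weeks, hourly
rng = np.random.RandomState(1)
hourly = pd.Series(70 + 10 * rng.rand(len(hours)), index=hours)  # invented temperatures

daily_max = hourly.resample('D').max()              # downsample to daily highs
smoothed_max = daily_max.rolling(window=7).mean()   # 7-day rolling mean of the highs
print(smoothed_max.head(10))
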
Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. Assign the result to ***daily_highs_smoothed*** and print the result.\n\n#%%\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n#%%\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n#%%\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n#%%\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n#%%\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n#%%\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n#%%\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n#%%\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n#%%\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n#%%\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n#%%\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n#%%\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n#%%\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n#%%\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n#%%\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n#%%\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n#%%\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n#%%\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n#%%\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n#%%\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n#%%\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n#%%\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n#%%\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n#%%\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n#%%\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n#%%\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n#%%\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n#%%\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n#%%\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n#%%\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n#%%\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes 
not only in your data, but also in your plotting.\n#\n# In this exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n#%%\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n#%%\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n#%%\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n#%%\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n#%%\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n#%%\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n#%%\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n#%%\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n#%%\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n#%%\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n#%%\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n#%%\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n#%%\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n#%%\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n#%%\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n#%%\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n#%%\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n#%%\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n#%%\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n#%%\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. 
You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n#%%\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n#%%\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n#%%\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n#%%\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n#%%\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. 
That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\ndf_clean.head(3)\n\n#%%\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n#%%\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n#%%\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n#%%\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n#%%\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n#%%\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n#%%", "original_comment": "# Calculate the mean of overcast_daily_max\n", "target_code": "overcast_daily_max_mean = overcast_daily_max.mean()\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "\n", "model": "no-comments", "intent": "# Calculate the mean of overcast_daily_max"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom 
sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n\n# shape\ndf.shape\n\n\ndf\n\n\n# top 5 rows in dataframe\ndf.head()\n\n\ndf.info()\n\n\ndf.describe()\n\n\n# statistical details T is transpost\ndf.describe().T\n\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n\nX\n\n\ny\n\n\n# ## split this data into training and test sets\n\n\n\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n\n\n\n\nmodel = LinearRegression()\n\n\nmodel\n\n\n# ## Train model\n\n\nmodel.fit()\n\n\nmodel.fit(X_train, y_train)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n#%%\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n#%%\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n#%%\n\n# shape\ndf.shape\n\n#%%\n\ndf\n\n#%%\n\n# top 5 rows in dataframe\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.describe()\n\n#%%\n\n# statistical details T is transpost\ndf.describe().T\n\n#%%\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. 
Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n#%%\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n#%%\n\nX\n\n#%%\n\ny\n\n\n# ## split this data into training and test sets\n\n#%%\n\n\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n#%%\n\n\n\n#%%\n\nmodel = LinearRegression()\n\n#%%\n\nmodel\n\n\n# ## Train model\n\n#%%\n\nmodel.fit()\n\n#%%\n\nmodel.fit(X_train, y_train)", "original_comment": "# ### Reshape Feature\n", "target_code": "X = X.reshape(-1, 1)\n", "project_metadata": {"full_name": "Jetsukda/ML-KBTGxMeowCode", "description": "Say \"Hello\" Machine Learning by KBTGxMeowCode", "topics": [], "git_url": "git://github.com/Jetsukda/ML-KBTGxMeowCode.git", "stars": 3, "watchers": 3, "forks": 34, "created": "2020-06-28T07:57:09Z", "size": 5316, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11012140}, "last_updated": "2020-09-01T17:59:00Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "y_pred = model.predict(X_test)\ny_pred\n", "model": "no-comments", "intent": "# Reshape Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. 
It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) signal if $z_{i}(t) \\leq -\\; \\alpha \\cdot z_{i}(t)$;\n# >- **\"Close Long-signal\"** (0.0) signal if $z_{i}(t) \\leq -\\; \\beta \\cdot z_{i}(t)$;\n# >- **\"Short-signal\"** (+1.0) signal if $z_{i}(t) \\geq +\\; \\alpha \\cdot z_{i}(t)$; and,\n# >- **\"Close Short-signal\"** (0.0) signal if $z_{i}(t) \\geq +\\; \\beta \\cdot z_{i}(t)$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set both Z-Score thresholds $\\alpha = 1.0$ and $\\beta = 0.5$ respectively, as done in the following:\n\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: inactive trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yer reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. 
Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n#%%\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n#%%\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n#%%\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n#%%\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n#%%\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n#%%\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n#%%\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n#%%\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n#%%\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n#%%\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n#%%\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n#%%\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n#%%\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n#%%\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) signal if $z_{i}(t) \\leq -\\; \\alpha \\cdot z_{i}(t)$;\n# >- **\"Close Long-signal\"** (0.0) signal if $z_{i}(t) \\leq -\\; \\beta \\cdot z_{i}(t)$;\n# >- **\"Short-signal\"** (+1.0) signal if $z_{i}(t) \\geq +\\; \\alpha \\cdot z_{i}(t)$; and,\n# >- **\"Close Short-signal\"** (0.0) signal if $z_{i}(t) \\geq +\\; \\beta \\cdot z_{i}(t)$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set both Z-Score thresholds $\\alpha = 1.0$ and $\\beta = 0.5$ respectively, as done in the following:\n\n#%%\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n#%%\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: inactive trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yer reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n#%%\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n#%%\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n#%%\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n#%%\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n#%%\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n#%%\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n#%%\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n#%%\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n#%%\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n#%%\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n#%%\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n#%%\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n#%%\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n#%%\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)", "original_comment": "# set axis labels\n", "target_code": "ax.set_xlabel('[month]', fontsize=10)\nax.set_ylabel('[year]', fontsize=10)\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_base.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)\n", "model": "natural", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # 
pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n#%%\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):", "original_comment": " # make sure output directory exists\n", "target_code": " output_dir = pathlib.Path(output_dir)\n output_dir.mkdir(exist_ok=True)\n", "project_metadata": {"full_name": "thehackerwithin/illinois", "description": "THW Chapter at U. 
Illinois", "topics": [], "git_url": "git://github.com/thehackerwithin/illinois.git", "stars": 13, "watchers": 13, "forks": 31, "created": "2015-02-18T19:38:33Z", "size": 61361, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 868658, "TeX": 34143, "R": 18922, "HTML": 10291, "Julia": 5254, "Python": 4028, "C++": 425, "CMake": 94}, "last_updated": "2020-09-30T18:16:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "metadata = pd.read_csv('metadata.csv')\nmetadata.head()\n", "model": "no-comments", "intent": " # make sure output directory exists"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. 
Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n\ndata['sex'].unique()\n\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. 
Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n#%%\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n#%%\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n#%%\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n#%%\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n#%%\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n#%%\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n#%%\n\ndata['sex'].unique()\n\n#%%\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n#%%\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n#%%\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n 
fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n#%%\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()", "original_comment": "# and capture it into a dataframe\n", "target_code": "prob_df = pd.DataFrame(prob_df)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "prob_df = pd.DataFrame(prob_df)\nprob_df\n", "model": "natural", "intent": "# capture it into a dataframe"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n\ncrimes_df = 
spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # FIT5202 Assignment 1 - Part B\n\n# ## Step 01: Import pyspark and initialise Spark\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import *\nfrom pyspark.sql import SparkSession\nfrom pyspark import SparkConf, SparkContext\nfrom datetime import datetime\nimport os\nos.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'\n\n# Importing necessary documents and libraries and creating entry points to spark\n\nsc = SparkContext.getOrCreate()\n\nconf = SparkConf().setMaster(\n \"local[*]\") .setAppName(\"FIT5202 Assignment 1 - Part B\")\nif sc == None:\n sc = SparkContext(conf=conf)\n\nspark = SparkSession(sparkContext=sc) .builder .config(\"spark.mongodb.input.uri\",\n \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .config(\"spark.mongodb.output.uri\", \"mongodb://127.0.0.1/FIT5202.Ass1PartB\") .getOrCreate()\n\n\n# ## Step 02: Create dataframe\n\n#%%\n\ncrimes = spark.read.csv(\n \"Crime_Statistics_SA_2010_present.csv\", header=True, inferSchema=True)\ncrimes = crimes.na.drop()\ncrimes.show()\n\n\n# ## Step 03: Write to Database\n\n#%%\n\ncrimes.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\n \"overwrite\").save()\n\n\n# ## Step 04: Read from Database\n\n#%%\n\ncrimes_df = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()\ncrimes_df.printSchema()\n\n\n# ## Step 05: Calculate the statistics of numeric and string columns\n\n#%%\n\n# The report count each day on all the dates\ncrime_by_date = crimes_df.groupBy(\"Reported Date\") .sum(\"Offence Count\") .withColumnRenamed(\n \"sum(Offence Count)\", \"Count\") .withColumnRenamed(\"Reported Date\", \"Date\")\ncrime_by_date.show()\n\n#%%\n\n# The average value of daily offence count\navg_count = crime_by_date.groupBy() .avg(\n \"Count\") .withColumnRenamed(\"avg(Count)\", \"Average Offence Count\")\navg_count.show()\n\n#%%", "original_comment": "# The standard deviation of daily offence count\n", "target_code": "std_count = crime_by_date.groupBy() .agg(stddev(\"Count\")\n ) .withColumnRenamed(\"stddev_samp(Count)\", \"Standard Deviation\")\n", "project_metadata": {"full_name": "WaicongTam/Assignment-Portfolio", "description": "This repository is showcase of the codes of my assignments. 
All the assignments I consider worth sharing will be updated here right after the late penalty has reached 50%.", "topics": [], "git_url": "git://github.com/WaicongTam/Assignment-Portfolio.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2019-06-01T03:27:31Z", "size": 10261, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1789685, "Java": 101530}, "last_updated": "2020-10-15T15:22:21Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "crimes_df.printSchema()\n", "model": "no-comments", "intent": "# The standard deviation of daily offence count"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. 
Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. Calculate Reduced Excess Rainfall:\n\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---\n\n\n\nif lid.shape[0] > 0:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. 
Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n#%%\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n#%%\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n#%%\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n#%%\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n#%%\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n#%%\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n#%%\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n#%%\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. Calculate Reduced Excess Rainfall:\n\n#%%\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---", "original_comment": "# ## C. 
Calculate the Lateral Inflow Hydrographs:\n", "target_code": " ReducedTable = calc_lateral_inflow_hydro(\n lid, ReducedTable, StormwaterTable, durations, BCN, display_print)\n", "project_metadata": {"full_name": "Dewberry/pfra-hydromet", "description": "Tools for developing pluvial (excess rainfall) and fluvial scenarios for probabilistic flood risk analyses", "topics": ["hydrology", "papermill", "montecarlo-simulation"], "git_url": "git://github.com/Dewberry/pfra-hydromet.git", "stars": 11, "watchers": 11, "forks": 12, "created": "2019-04-18T13:04:55Z", "size": 165396, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 59869977, "Python": 186157}, "last_updated": "2020-10-27T14:37:20Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "\n", "model": "no-comments", "intent": " # C. Calculate the Lateral Inflow Hydrographs:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n\n# shape\ndf.shape\n\n\ndf\n\n\n# top 5 rows in dataframe\ndf.head()\n\n\ndf.info()\n\n\ndf.describe()\n\n\n# statistical details T is transpost\ndf.describe().T\n\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. 
Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n\nX\n\n\ny\n\n\n# ## split this data into training and test sets\n\n\n\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n\n\n\n\nmodel = LinearRegression()\n\n\nmodel\n\n\n# ## Train model\n\n\nmodel.fit()\n\n\nmodel.fit(X_train, y_train)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n#%%\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n#%%\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n#%%\n\n# shape\ndf.shape\n\n#%%\n\ndf\n\n#%%\n\n# top 5 rows in dataframe\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.describe()\n\n#%%\n\n# statistical details T is transpost\ndf.describe().T\n\n#%%\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n#%%\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n#%%\n\nX\n\n#%%\n\ny\n\n\n# ## split this data into training and test sets\n\n#%%\n\n\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n#%%\n\n\n\n#%%\n\nmodel = LinearRegression()\n\n#%%\n\nmodel\n\n\n# ## Train model\n\n#%%\n\nmodel.fit()\n\n#%%\n\nmodel.fit(X_train, y_train)", "original_comment": "# ### Reshape Feature\n", "target_code": "X = X.reshape(-1, 1)\n", "project_metadata": {"full_name": "Jetsukda/ML-KBTGxMeowCode", "description": "Say \"Hello\" Machine Learning by KBTGxMeowCode", "topics": [], "git_url": "git://github.com/Jetsukda/ML-KBTGxMeowCode.git", "stars": 3, "watchers": 3, "forks": 34, "created": "2020-06-28T07:57:09Z", "size": 5316, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11012140}, "last_updated": "2020-09-01T17:59:00Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "X_train_reshaped = X_train.reshape(-1, 1)\nX_test_reshaped = X_test.reshape(-1, 1)\nmodel.fit(X_train_reshaped, y_train)\n", "model": "docstring", "intent": "# Reshape Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's start with the basics\n\n\nsimple = list(range(1, 19))\nsimple\n\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n\n# Select the last 
item using positive indexation\nsimple[17]\n\n\n# Select the last item using negative indexation\nsimple[-1]\n\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n\n# What is the biggest number in the list?\nmax(simple)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's start with the basics\n\n#%%\n\nsimple = list(range(1, 19))\nsimple\n\n#%%\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n#%%\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n#%%\n\n# Select the last item using positive indexation\nsimple[17]\n\n#%%\n\n# Select the last item using negative indexation\nsimple[-1]\n\n#%%\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n#%%\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n#%%\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n#%%\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n#%%\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n#%%\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n#%%\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n#%%\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n#%%\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n#%%\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n#%%\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n#%%\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n#%%\n\n# What is the biggest number in the list?\nmax(simple)\n\n#%%", "original_comment": "# And the smallest?\n", "target_code": "min(simple)\n", "project_metadata": {"full_name": "shotleft/how-to-python", "description": null, "topics": [], "git_url": "git://github.com/shotleft/how-to-python.git", "stars": 11, "watchers": 11, "forks": 4, "created": "2018-05-03T04:32:17Z", "size": 3364, "license": "", "language": "Jupyter Notebook", 
"languages": {"Jupyter Notebook": 2974562}, "last_updated": "2020-12-05T20:07:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(max(simple))\nprint(min(simple))\n", "model": "natural", "intent": "# And the smallest?"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n\ndf_1 = pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n\n# Checking for data types\n# fraud_df.dtypes\n\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n\n#fraud_df = fraud_df.head(10000)\n\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. 
For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n\nfraud_df.head()\n\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### Encoding\n\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n\nfraud_df[cat_col_onehotencode].head()\n\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: 
{}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n\nfraud_df.head()\n\n\n# fraud_df.columns\n\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n\n# change data sets into maxtrix objects for the 
models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n#%%\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n#%%\n\ndf_1 = 
pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n#%%\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n#%%\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n#%%\n\n# Checking for data types\n# fraud_df.dtypes\n\n#%%\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n#%%\n\n#fraud_df = fraud_df.head(10000)\n\n#%%\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n#%%\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n#%%\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n#%%\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### 
Encoding\n\n#%%\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n#%%\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n#%%\n\nfraud_df[cat_col_onehotencode].head()\n\n#%%\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n#%%\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# fraud_df.columns\n\n#%%\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n#%%\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n#%%\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n#%%\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n#%%\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n#%%\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n#%%\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n#%%\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n#%%\n\n# change data 
sets into maxtrix objects for the models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n#%%\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n#%%\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)", "original_comment": "# Summarize the new class distribution\n", "target_code": "counter = Counter(y_train)\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": 
"Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "counter = Counter(y_train)\nprint(counter)\n", "model": "docstring", "intent": "# Summarize the new class distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nplt.imshow(img1)\n\n\nplt.imshow(img2)\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\n# blending images of same size\n\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n\n# Overlay small image on top of larger image\n# numpy reassignment\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nlarge_img = img1\nsmall_img = img2\n\n\nx_offset = 0\ny_offset = 0\n\n\n# in numpy x axis is vertical and y axis is horizontal\n\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n\n# Blend images of different sizes\n\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n\nimg1.shape\n\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n\nimg2.shape\n\n\nrows, cols, channels = img2.shape\n\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n\nmask_inv.shape\n\n\n# you can see the image is 2D now\n\n\n\n\n\nwhite_bgd = np.full(img2.shape, 255, 
dtype=np.uint8)\nwhite_bgd.shape\n\n\nplt.imshow(white_bgd)\n\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n\nlarge_img = img1\nsmall_img = final_roi\n\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n\n\n\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n\nret\n\n\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\nshow_img(img)\n\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n\n\n\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n\ni = load_img()\nshow_img(i)\n\n\ngamma = 1/4\n\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n 
fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n\nimg = load_img()\nshow_img(img)\n\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n\nimg = load_img()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Image Processing\n\n# - Goals\n# - Learn various image processing operations\n# - Perform image operations such as Smoothing, Blurring, Morphological Operations\n# - Grab properties such as color spaces and histograms\n\n# #### Class 1 - Color Mappngs\n\n# - So far we worked with RGB color spaces\n# - There are some other models like HSL (Hue, Saturation, Lightness) and\n# HSV(Hue, Saturation and Value)\n# - HSL and HSV are more aligned with human vision actually perceives\n# - While in this course we deal with RGB images, its a good idea to understand about HSV and HSL colorspaces\n\n#%%\n\nimport numpy as np\nimport cv2\nimport matplotlib.pyplot as plt\n\n#%%\n\nimg = cv2.imread('../Data/00-puppy.jpg')\nplt.imshow(img) # BGR cahannel\n\n#%%\n\n# converting to RGB\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n\n#%%\n\n# converting to HSV\nplt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))\n\n\n# #### Class 2 - Blending and Pasting Images\n\n# Blend images\n# Formula:\n# new_pixel = alpha x pixel_1(1st image) + beta x pixel_2(2nd image) + gamma\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nplt.imshow(img1)\n\n#%%\n\nplt.imshow(img2)\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\n# blending images of same size\n\n#%%\n\n# resize to equal sizes\nimg1 = cv2.resize(img1, (1200, 1200))\nimg2 = cv2.resize(img2, (1200, 1200))\n\n#%%\n\nprint('img1 shape: ', img1.shape)\nprint('img2 shape: ', img2.shape)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.5,\n src2=img2, beta=0.5, gamma=0)\nplt.imshow(blended)\n\n#%%\n\nblended = cv2.addWeighted(src1=img1, alpha=0.8,\n src2=img2, beta=0.2, gamma=0)\nplt.imshow(blended)\n\n#%%\n\n# Overlay small image on top of larger image\n# numpy 
reassignment\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nlarge_img = img1\nsmall_img = img2\n\n#%%\n\nx_offset = 0\ny_offset = 0\n\n#%%\n\n# in numpy x axis is vertical and y axis is horizontal\n\n#%%\n\nx_end = x_offset + small_img.shape[1]\ny_end = y_offset + small_img.shape[0]\n\n#%%\n\nlarge_img[y_offset:y_end, x_offset:x_end] = small_img\nplt.imshow(large_img)\n\n#%%\n\n# Blend images of different sizes\n\n#%%\n\nimg1 = cv2.cvtColor(cv2.imread('../Data/dog_backpack.png'),\n cv2.COLOR_BGR2RGB)\nimg2 = cv2.cvtColor(cv2.imread('../Data/watermark_no_copy.png'),\n cv2.COLOR_BGR2RGB)\n\n#%%\n\nimg2 = cv2.resize(img2, (600, 600)) # img2 is smaller than img1\n\n#%%\n\nimg1.shape\n\n#%%\n\nx_offset = 934 - 600\ny_offset = 1401 - 600\n\n#%%\n\nimg2.shape\n\n#%%\n\nrows, cols, channels = img2.shape\n\n#%%\n\n# region of interest\nroi = img1[y_offset:1401, x_offset:934]\nplt.imshow(roi)\n\n#%%\n\nimg2gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\nplt.imshow(img2gray, cmap='gray')\n\n#%%\n\nmask_inv = cv2.bitwise_not(img2gray)\nplt.imshow(mask_inv, cmap='gray')\n\n#%%\n\nmask_inv.shape\n\n#%%\n\n# you can see the image is 2D now\n\n#%%\n\n\n\n#%%\n\nwhite_bgd = np.full(img2.shape, 255, dtype=np.uint8)\nwhite_bgd.shape\n\n#%%\n\nplt.imshow(white_bgd)\n\n#%%\n\nbk = cv2.bitwise_or(white_bgd, white_bgd, mask=mask_inv)\nplt.imshow(bk)\n\n#%%\n\nfg = cv2.bitwise_or(img2, img2, mask=mask_inv)\nplt.imshow(fg)\n\n#%%\n\nfinal_roi = cv2.bitwise_or(roi, fg)\nplt.imshow(final_roi)\n\n#%%\n\nlarge_img = img1\nsmall_img = final_roi\n\n#%%\n\nlarge_img[y_offset:y_offset+small_img.shape[0],\n x_offset:x_offset+small_img.shape[1]] = small_img\nplt.imshow(large_img)\n\n\n# #### Class 3 Image Threshodling\n\n# - Thresholding is fundamentally a very simple method of segmenting an image into different parts\n# - Threshodling will convert an image to white or black\n\n#%%\n\n\n\n#%%\n\nimg = cv2.imread('../Data/rainbow.jpg')\nplt.imshow(img)\n\n#%%\n\n# read as grayscale\nimg_gray = cv2.imread('../Data/rainbow.jpg', 0)\nplt.imshow(img_gray, cmap='gray')\n\n\n# ###### Threshold types\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY)\n\n#%%\n\nret\n\n#%%\n\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_BINARY_INV)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nret, thresh1 = cv2.threshold(img_gray, thresh=img.max()/2, maxval=255,\n type=cv2.THRESH_TRUNC)\nprint(ret)\nplt.imshow(thresh1, cmap='gray')\n\n#%%\n\nimg = cv2.imread('../Data/crossword.jpg', 0)\nplt.imshow(img, cmap='gray')\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\nshow_img(img)\n\n#%%\n\nret, thr1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)\nshow_img(thr1)\n\n#%%\n\nthr2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C,\n cv2.THRESH_BINARY, 11, 9)\nshow_img(thr2)\n\n#%%\n\nblended = cv2.addWeighted(thr1, 0.7, thr2, 0.4, 0)\nshow_img(blended)\n\n\n# #### Class 4 Blurring and Smoothing\n\n# - Blurring/Smoothing is combined with edge detection\n# - Edge detection algorithms detect too many edges when shown a high resolution image without any blurring\n\n# - Methods\n# - Gamma Correction:\n# - 
can be applied to an image to make it appear brighter or darker depending on the Gamma value chosen\n# - Kernel Based Filters\n# - can be applied over an image to produce a variet of effects\n#\n\n#%%\n\n\n\n#%%\n\ndef load_img():\n img = cv2.imread('../Data/bricks.jpg').astype(np.float32) / 255\n img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n return img\n\n#%%\n\ndef show_img(img):\n fig = plt.figure(figsize=(15, 15))\n ax = fig.add_subplot(111)\n ax.imshow(img, cmap='gray')\n\n#%%\n\ni = load_img()\nshow_img(i)\n\n#%%\n\ngamma = 1/4\n\n#%%\n\nresult = np.power(i, gamma)\nshow_img(result)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.float32) / 25\nkernel\n\n#%%\n\ndst = cv2.filter2D(img, -1, kernel)\nshow_img(dst)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblurred = cv2.blur(img, ksize=(10, 10))\nshow_img(blurred)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\ngaussian_blur = cv2.GaussianBlur(img, (5, 5), 10)\nshow_img(gaussian_blur)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nmedian_blur = cv2.medianBlur(img, 5)\nshow_img(median_blur)\n\n#%%\n\nimg = cv2.cvtColor(cv2.imread('../Data/sammy.jpg'), cv2.COLOR_BGR2RGB)\nshow_img(img)\n\n#%%\n\nnoisy_img = cv2.imread('../Data/sammy_noise.jpg')\nshow_img(noisy_img)\n\n#%%\n\nmedian = cv2.medianBlur(noisy_img, 5)\nshow_img(median)\n\n#%%\n\nimg = load_img()\nfont = cv2.FONT_HERSHEY_COMPLEX\nshow_img(cv2.putText(img, text='bricks', org=(10, 600), fontFace=font,\n fontScale=10, color=(255, 0, 0), thickness=5))\nprint('reset')\n\n#%%\n\nblur = cv2.bilateralFilter(img, 9, 75, 75)\nshow_img(blur)\n\n\n# #### Class 5 Morphological Operators\n\n# - MO are sets of kernels that can achienve a variety of effects such as reducing noise\n# - Certain operators are very good at reducing black points on a white background\n# - Certain operators can also achieve an erosion and dilation effect that can add or erode from an existing image\n# - This effect is mostly seen on text data, so we will practisce various morphological operators on some simple white text on a balck background.\n\n#%%\n\ndef load_img():\n blank_img = np.zeros((600, 600))\n font = cv2.FONT_HERSHEY_SIMPLEX\n cv2.putText(blank_img, text='ABCDE', org=(20, 400), fontFace=font,\n fontScale=5, color=(255, 255, 255), thickness=30)\n return blank_img\n\n#%%\n\nimg = load_img()\nshow_img(img)\n\n#%%\n\nkernel = np.ones((5, 5), dtype=np.uint8)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=1)\nshow_img(result)\n\n#%%\n\nresult = cv2.erode(img, kernel, iterations=4)\nshow_img(result)\n\n#%%\n\nimg = load_img()", "original_comment": "# creating white noise\n", "target_code": "white_noise = np.random.randint(0, 2, size=(600, 600))\n", "project_metadata": {"full_name": "RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning", "description": null, "topics": [], "git_url": 
"git://github.com/RamjiB/Python-for-Computer-Vision-with-OpenCV-and-Deep-Learning.git", "stars": 3, "watchers": 3, "forks": 5, "created": "2019-05-28T02:31:41Z", "size": 48363, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 98148466, "Python": 466}, "last_updated": "2020-12-21T09:24:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "noise = np.random.white(100)\nshow_img(noise)\nkernel = np.ones((5, 5), dtype=np.uint8)\nresult = cv2.dilate(noise, kernel, iterations=1)\nshow_img(result)\nkernel = np.ones((5, 5), dtype=np.uint8)\nresult = cv2.dilate(noise, kernel, iterations=4)\nshow_img(result)\n", "model": "docstring", "intent": "# creating white noise"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n#%%\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')", "original_comment": "# normalize the format of DATE\n", "target_code": "train_target['Date'] = pd.to_datetime(train_target['Date'])\n", "project_metadata": {"full_name": "Quan-Sun/TADPOLE-ECE5970", "description": "machine learning with biomedical data", "topics": [], "git_url": "git://github.com/Quan-Sun/TADPOLE-ECE5970.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-11-16T21:39:24Z", "size": 15650, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5564392}, "last_updated": "2019-04-19T22:32:32Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", 
"coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "scaler = MinMaxScaler(feature_range=(0, 1))\ntrain_data = scaler.fit_transform(train_data)\n", "model": "docstring", "intent": "# normalize the format of DATE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n\ndf_reviews.head()\n\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n#%%\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n#%%\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n#%%\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n#%%\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n#%%\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n#%%\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n#%%\n\ndf_reviews.head()\n\n#%%\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n#%%\n\n# Find the number of unique beers in 
reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n#%%\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n#%%\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n#%%\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n#%%", "original_comment": "# Find value counts for every column in reviews.csv\n", "target_code": "df_reviews.apply(lambda x: x.isnull().value_counts())\n", "project_metadata": {"full_name": "sheetalbongale/ALE-gorithm", "description": "All things Beer! Beer Educator and Recommender Web App | Deployed on GCP > https://alegorithm-fxljyqhslq-uc.a.run.app/ | UT Data Analysis and Visualization Nov 2019 - May 2020. ", "topics": ["recommender", "gcp-cloud-build", "python-flask-application", "sqlalchemy", "plotlyjs", "anychart-javascript-library", "d3js", "mysql"], "git_url": "git://github.com/sheetalbongale/ALE-gorithm.git", "stars": 5, "watchers": 5, "forks": 5, "created": "2020-03-01T22:59:58Z", "size": 56307, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 204948, "JavaScript": 52038, "CSS": 48412, "HTML": 46213, "Python": 15403, "Dockerfile": 433}, "last_updated": "2020-05-07T08:39:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_beers[\"style\"].value_counts()\n", "model": "no-comments", "intent": "# Find value counts for every column in reviews.csv"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' 
%(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name 
not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = 
'/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n\nimageio.plugins.freeimage.download()\n\n\ndef 
keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n\nlog_list = 
[]\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n 
\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n 
lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n#%%\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n#%%\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n#%%\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = 
img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n#%%\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n#%%\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n#%%\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = 
'/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', 
data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n#%%\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if 
ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n#%%\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n#%%\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n#%%\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n#%%\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in 
enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n#%%\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n#%%\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n#%%\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n#%%\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = 
cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n#%%\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n#%%\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n#%%\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n#%%\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n#%%\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n#%%\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n 
countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n#%%\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')", "original_comment": "# Tweak spacing to prevent clipping of ylabel\n", "target_code": "plt.subplots_adjust(left=0.15)\n", "project_metadata": {"full_name": "kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN", "description": "Predict HDR images from LDR images using CNN", "topics": [], "git_url": "git://github.com/kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2017-07-10T10:31:45Z", "size": 16499, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 96258, "Python": 44059, "MATLAB": 26466, "Shell": 15315, "M": 423}, "last_updated": "2020-07-07T08:49:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.gca().set_yticklabels([])\nplt.gca().set_xticklabels([])\nplt.hist(finHdrARR, num_bins, facecolor='green')\nplt.title(\"High Dynamic Range Data frequency\")\nplt.xlabel('Range')\nplt.ylabel('Frequency')\n", "model": "docstring", "intent": "# Tweak spacing to prevent clipping of ylabel"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n\niris_df = pd.read_csv('iris.csv', 
skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n\nfeatures = iris_df.values\n\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n\n\n\n\n# ### Split the data into training and testing data, with random seeding\n\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n#%%\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n#%%\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n#%%\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n#%%\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n#%%\n\nfeatures = iris_df.values\n\n#%%\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n#%%\n\n\n\n#%%\n\n# ### Split the data into training and testing data, with random seeding\n\n#%%\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)", "original_comment": "# ### Creating an instance of LogisticRegrssion class\n", "target_code": "from sklearn.linear_model import LogisticRegression\n\nlogReg = LogisticRegression()\n", "project_metadata": {"full_name": "naveen21553/ml-workshop", "description": "Machine Learning Workshop Resources", "topics": [], "git_url": "git://github.com/naveen21553/ml-workshop.git", "stars": 12, "watchers": 12, "forks": 14, "created": "2018-09-28T15:03:08Z", "size": 5274, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 685393, "Python": 11705}, "last_updated": "2020-10-11T10:46:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "clf = LogisticRegression()\nclf.fit(x_train, y_train)\n", "model": 
"docstring", "intent": "# Creating an instance of LogisticRegrssion class"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 
0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n#%%\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n#%%\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n#%%\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. 
Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n#%%\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()", "original_comment": "# ### 1.5 Distribution of rgb channels of negative examples\n", "target_code": "for n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n", "project_metadata": {"full_name": "itratrahman/covid_19", "description": "This project contains AI and Data Science projects that analyses disease classification from images, forecasting, and EDA 
report of the pandemic.", "topics": [], "git_url": "git://github.com/itratrahman/covid_19.git", "stars": 5, "watchers": 5, "forks": 0, "created": "2020-03-22T03:36:28Z", "size": 26502, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6190010}, "last_updated": "2020-04-28T07:40:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\nsns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\nsns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n", "model": "docstring", "intent": "# 1.5 Distribution of rgb channels of negative examples"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. 
col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. (Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\nimport nltk\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n#%%\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. 
col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n#%%\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n#%%\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n#%%\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. 
(Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''", "original_comment": " # Tokenize words in reviews:\n", "target_code": " import nltk\n\n df_with_reviews['rev_cleaned'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: nltk.word_tokenize(x))\n", "project_metadata": {"full_name": "MeredithLevsen/InsightProject", "description": "GameOn - Quickly evaluate board games based on user reviews", "topics": [], "git_url": "git://github.com/MeredithLevsen/InsightProject.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2018-07-17T17:31:15Z", "size": 541, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1045270, "HTML": 265408}, "last_updated": "2018-12-04T03:47:10Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_with_reviews['rev_cleaned'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: nltk.word_tokenize(x))\n", "model": "natural", "intent": " # Tokenize words in reviews:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? 
(Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. control for t1, t2, and t3.\n#\n\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of 
europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. 
control for t1, t2, and t3.\n#\n\n#%%\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n#%%\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n#%%\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n#%%\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n#%%\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n#%%\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n#%%\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n#%%\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 
2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n#%%\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n#%%\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n#%%", "original_comment": "# Plot the histogram for the values of each year.\n", "target_code": "plt.figure()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.distplot(gdp['2007'])\n", "model": "natural", "intent": "# Plot the histogram for the values of each year."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. 
Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. Let's look at a portion of the table\n\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = 
str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n plt.colorbar()\n tick_marks = 
np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ![TrustKeeper](img/TK_logo.png)\n#\n# # [Link to Medium story](https://blog.insightdatascience.com/fraud-prevention-in-peer-to-peer-p2p-transaction-networks-using-neural-nets-a-node-embedding-b14a99f8ba30)\n\n# ![TrustKeeperHowItWorks](img/TK_how.png)\n#\n# # Description\n#\n# #### In this notebook, we show how to implement TrustKeeper algorithm to predict fraudulent transactions in the context of a Peer-to-Peer (P2P) Bitcoin transaction network\n#\n# #### We will first compute node embeddings using the Node2Vec algorithm and the information from the adjacency matrix in the Bitcoin network. Finally, we will use these node representations to train different classifiers for predicting transaction scores.\n\n# ### Import basic libraries\n\n#%%\n\nimport pickle\nfrom keras.layers.merge import Concatenate\nfrom keras.utils import np_utils\nfrom keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten\nfrom keras.wrappers.scikit_learn import KerasClassifier\nfrom keras.models import load_model\nfrom keras.layers import Dense\nfrom keras.models import Sequential\nimport keras\nfrom sklearn.metrics import roc_curve\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import classification_report\nfrom sklearn.metrics import confusion_matrix as cm\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.preprocessing import StandardScaler, RobustScaler\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport csv\nimport itertools\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # **Part 1 - Preprocessing Transaction Rating Data**\n#\n# # How do people rate each other on Bitcoin Marketplaces?\n#\n# ![TrustKeeperScore](img/TK_rating.png)\n#\n# #### Load TRIMMED_NETWORK for extracting the node data that will be used to train models. 
Let's look at a portion of the table\n\n#%%\n\ntrim_network = pd.read_csv('2_TRIMMED_NETWORK.csv')\ntrim_network.head()\n\n\n# #### Define function to extract node features from the network dataframe\n\n#%%\n\ndef GetNodeFeatures(GDF, n):\n if n == 0:\n return ['in_degree', 'pos_in_edges', 'neg_in_edges', 'out_degree', 'pos_out_edges', 'neg_out_edges']\n # Get sub-dataframes\n outgoing = GDF[GDF['SOURCE'] == n].copy().reset_index(drop=True)\n incoming = GDF[GDF['TARGET'] == n].copy().reset_index(drop=True)\n\n # Compute node degree (incoming)\n in_degree = len(incoming)\n\n # Compute node degree (outgoing)\n out_degree = len(outgoing)\n\n # Compute number of incoming nodes with positive ratings\n pos_in_edges = len([i for i in incoming.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_in_edges = len([i for i in incoming.RATING.values if i < 0])\n\n # Repeat for outgoing edges\n pos_out_edges = len([i for i in outgoing.RATING.values if i > 0])\n\n # Computer number of incoming nodes with negative ratings\n neg_out_edges = len([i for i in outgoing.RATING.values if i < 0])\n\n return np.array([in_degree, pos_in_edges, neg_in_edges, out_degree, pos_out_edges, neg_out_edges])\n\n\n# #### Obtain node features for all nodes in trimmed network\n\n#%%\n\n# Define list of nodes and sort it in ascending order\nnodes = list(set(trim_network['SOURCE']) | set(trim_network['TARGET']))\nnodes.sort()\n\n# Create NF, the node feature matrix\nNF = np.zeros([len(nodes), 6])\nfor i, n in enumerate(nodes):\n NF[i, :] = np.array(GetNodeFeatures(trim_network, n))\n\n# Create a dataframe containing all these node features\nNF_df = pd.DataFrame(columns=GetNodeFeatures(\n trim_network, 0), data=NF, index=nodes)\nNF_df.to_csv('trimmed_node_features.csv')\n\n#%%\n\nNF_df = pd.read_csv('trimmed_node_features.csv', index_col=0)\n\n\n# #### Create an edgelist file for the trimmed network\n\n#%%\n\nf = open('trimmed_network.edgelist', 'w')\nfor i in range(len(trim_network)):\n s = str(trim_network['SOURCE'][i])\n t = str(trim_network['TARGET'][i])\n f.write(s+' '+t+'\\n')\nf.close()\n\n\n# #### Run [Node2Vec](https://snap.stanford.edu/node2vec/) to learn an embedding for each node in the trimmed_network\n#\n# We will set the embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.\n#\n# Run this command in your terminal inside this directory (make sure your python environment is set to python=2.7). 
This step will take a while\n# ```bash\n# python ./node2vec/src/main.py --input ./trimmed_network.edgelist --output ./trimmed_network.emb --dimensions 14 --num-walks 25 --iter 15\n# ```\n\n# #### Create dictionary of nodes:embeddings fromthe trimmed_network.emb file created above\n\n#%%\n\nn = []\ne = []\nwith open('./trimmed_network.emb') as fin:\n for line in fin:\n node_emb = line.strip().split()\n n.append(node_emb[0])\n e.append(node_emb[1:])\nn = n[1:]\nn = [int(i) for i in n]\nembs = np.zeros([len(e)-1, 14])\nfor i in range(1, len(e)):\n embs[i-1] = e[i]\nembs.shape\n\n\n# #### Create a 2D projection of the embeddings using [t-SNE](https://lvdmaaten.github.io/tsne/)\n\n#%%\n\n#tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)\n#proj = tsne.fit_transform(embs)\n# np.save('trimmed_network_tSNE_projection.npy',proj)\nproj = np.load('trimmed_network_tSNE_projection.npy')\n\n\n# #### Plot node projections and color them according to the Bitcoin Marketplace they belong to (OTC or ALPHA)\n\n#%%\n\nalpha_nodes = []\notc_nodes = []\nfor i in range(len(n)):\n if n[i] > 6005: # The highest OTC node ID is 6005\n alpha_nodes.append(i)\n else:\n otc_nodes.append(i)\n\n#%%\n\nplt.figure(figsize=(10, 10))\nalpha = proj[alpha_nodes, :]\notc = proj[otc_nodes, :]\nplt.scatter(alpha[:, 0], alpha[:, 1], c='b', label='Bitcoin Alpha')\nplt.scatter(otc[:, 0], otc[:, 1], c='r', label='Bitcoin OTC')\nplt.legend()\nplt.title('2-dimensional mapping with t-SNE of Bitcoin network nodes with Node2Vec')\n\n\n# #### Thus, Node2Vec is able to represent nodes in a 14-dimensional space that accounts for the network structure the nodes' neighborhoods. We will expand these node representations by concatenating the 6 node features we calculated earlier so that we get a final 20-dimensional representation for each node.\n#\n# #### **Normalize node features and concatenate with Node2Vec embeddings**\n\n#%%\n\n# Recall that we have 14 Node2Vec features plus 6 features extracted from the rating network\nNF_matrix = np.zeros([len(n), 20])\nfor i in range(len(n)):\n NF_matrix[i] = np.concatenate(\n [embs[i], np.log2(NF_df.loc[n[i]].values + 1)])\nNF_matrix.shape\n\n\n# #### Because all 20 features have different scales, we will normalize the entire matrix using the following formula:\n#\n# $$X_{normalized}=\\frac{X - \\mu_{X}}{\\sigma_{X}}$$\n#\n# Where $\\mu_X$ and $\\sigma_X$ are the columnwise mean/STD values of the matrix $X$. In this way, we ensure that all values in the matrix lie within the $[-3\\sigma_X,3\\sigma_X]$ range\n\n#%%\n\nNF_matrix_mean = np.mean(NF_matrix, axis=0)\nNF_matrix_std = np.std(NF_matrix, axis=0)\nNF_matrix_norm = (NF_matrix - NF_matrix_mean) / (NF_matrix_std)\n\n\n# #### Capture node ID and embedding into a DataFrame\n\n#%%\n\nemb_df_norm = pd.DataFrame(columns=list(\n range(1, 21)), index=n, data=NF_matrix_norm)\nemb_df_norm.head()\n\n#%%\n\nemb_df_norm.to_csv('./trimmed_node_embeddings_mean_normalized.csv')\n\n\n# #### Construct the input matrix $X$ and the output vector $y$. Each row of the $X$ matrix contains 40 values (20 values for the source node or buyer, 20 values for the target node or seller). 
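# Editor's hedged sketch: the row construction just described, written as a small
# helper -- concatenate the normalized 20-dim embeddings of the buyer (source) and
# the seller (target) into one 40-dim edge feature vector. The helper name is
# hypothetical; the notebook itself builds these rows inline in a loop.
def edge_features(source_id, target_id, emb_df):
    left = emb_df.loc[source_id, :].values    # 20 features of the buyer
    right = emb_df.loc[target_id, :].values   # 20 features of the seller
    return np.concatenate([left, right])      # shape (40,)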
The $y$ vector captures the rating of the transaction the seller received from the buyer.\n#\n# #### **NOTE:** Here we assign a score of 1 if transaction is rated badly (fraud), and 0 otherwise\n\n#%%\n\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized.csv',index_col=0)\n#emb_df_norm = pd.read_csv('trimmed_node_embeddings_normalized_with_log10.csv',index_col=0)\nemb_df_norm = pd.read_csv(\n './trimmed_node_embeddings_mean_normalized.csv', index_col=0)\n\n\n# ### Create X input matrix and y output vector\n\n#%%\n\nX = np.zeros([len(trim_network), 40])\ny = np.zeros(len(trim_network))\nfor i in range(len(trim_network)):\n s = trim_network['SOURCE'][i]\n t = trim_network['TARGET'][i]\n r = trim_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X[i, :] = np.concatenate([left, right])\n if r > 0:\n y[i] = 0 # If not fraud\n else:\n y[i] = 1 # If fraud\n\n\n# #### Repeat procedure for calculating the $X_{val}$ and $y_{val}$ from the validation table\n\n#%%\n\nval_network = pd.read_csv('3_TEST_NETWORK.csv')\nX_val = np.zeros([len(val_network), 40])\ny_val = np.zeros(len(val_network))\nfor i in range(len(val_network)):\n s = val_network['SOURCE'][i]\n t = val_network['TARGET'][i]\n r = val_network['RATING'][i]\n left = emb_df_norm.loc[s, :].values\n right = emb_df_norm.loc[t, :].values\n X_val[i, :] = np.concatenate([left, right])\n if r > 0:\n y_val[i] = 0 # If not fraud\n else:\n y_val[i] = 1 # If fraud (class imbalance)\n\n\n# # **Part 2 - Model Construction, training, and evaluation**\n#\n# #### Models evaluated:\n#\n# * **TrusKeeper** (Deep Neural Network trained with both Node2Vec features and perception scores)\n# * Deep Neural Network trained with perception scores only\n# * Logistic Regression trained with both Node2Vec features and perception scores\n# * Logistic Regression trained with perception scores only\n\n# #### Load Machine/Deep Learning libraries\n\n#%%\n\n# Machine Learning\n\n# Deep Learning\n\n\n# #### Define function for generating small samples from training data to train Neural Networks without biasing it towards the most abundant category (0)\n\n#%%\n\n# Create train and test sets\nx_train, x_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\ndef CreateSample(frac=0.8):\n fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 1])\n not_fraud_indices = np.array(\n [i for i in range(len(y_train)) if y_train[i] == 0])\n sample_size = int(np.round(len(fraud_indices) * frac))\n fraud_sample = np.random.randint(0, len(fraud_indices), sample_size)\n not_fraud_sample = np.random.randint(\n 0, len(not_fraud_indices), sample_size)\n y_sample = np.concatenate(\n [y_train[fraud_indices][fraud_sample], y_train[not_fraud_indices[not_fraud_sample]]])\n x_sample = np.concatenate(\n [x_train[fraud_indices][fraud_sample], x_train[not_fraud_indices[not_fraud_sample]]])\n return x_sample, y_sample\n\n\n# ### Define a function to plot Confusion Matrices\n\n#%%\n\ndef plot_confusion_matrix(cm, classes,\n normalize=False,\n title='Confusion matrix',\n cmap=plt.cm.Blues):\n \"\"\"\n This function prints and plots the confusion matrix.\n Normalization can be applied by setting `normalize=True`.\n \"\"\"\n if normalize:\n cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n print(\"Normalized confusion matrix\")\n else:\n print('Confusion matrix, without normalization')\n\n print(cm)\n\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n 
plt.colorbar()\n tick_marks = np.arange(len(classes))\n plt.xticks(tick_marks, classes, rotation=45)\n plt.yticks(tick_marks, classes)\n\n fmt = '.2f' if normalize else 'd'\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n plt.text(j, i, format(cm[i, j], fmt),\n horizontalalignment=\"center\",\n color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.ylabel('True label')\n plt.xlabel('Predicted label')\n plt.tight_layout()\n\n\n# ### Construct TrustKeeper model\n\n#%%\n\n# Initialize model\n\ninput_size = 40 # Features from X matrix\n\nM1 = Sequential()\n\n# Add an input layer\nM1.add(Dense(128, activation='relu', input_shape=(input_size,)))\n\n# Add first hidden layer\nM1.add(Dense(128, activation='relu'))\n\n# Add second hidden layer\nM1.add(Dense(64, activation='relu'))\n\n# Add third hidden layer\nM1.add(Dense(32, activation='relu'))\n\n# Add fourth hidden layer\nM1.add(Dense(16, activation='relu'))", "original_comment": "# Add output layer\n", "target_code": "M1.add(Dense(1, activation='sigmoid'))\n", "project_metadata": {"full_name": "insight-decentralized-consensus-lab/TrustKeeper", "description": "A fraud prevention system for Peer-to-Peer transaction networks (Jahir M Gutierrez)", "topics": [], "git_url": "git://github.com/insight-decentralized-consensus-lab/TrustKeeper.git", "stars": 8, "watchers": 8, "forks": 7, "created": "2018-09-28T20:15:21Z", "size": 10845, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 614132, "Scala": 19178, "Python": 14102}, "last_updated": "2020-03-18T22:55:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "M1.add(Dense(1, activation='sigmoid'))\n", "model": "docstring", "intent": "# Add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n\nX_train.shape, Y_train.shape\n\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", fontsize=16)\n plt.ylabel(\"$x(t)$\", 
fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
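# Editor's hedged illustration: under an MSE loss the best point forecast is the
# conditional mean. If a step ends near +0.5 with probability 0.65 and near -0.5
# with probability 0.35 (levels chosen only for illustration), that mean is 0.15 --
# a value the series itself almost never takes, which is exactly the
# "mean path that never happens" problem described above.
p_up, level_up, level_down = 0.65, 0.5, -0.5
mse_optimal_forecast = p_up * level_up + (1 - p_up) * level_down
print(mse_optimal_forecast)  # 0.15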
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
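# Editor's hedged sketch: one way to quantify how poorly the unimodal Gaussian above
# explains this step is its average negative log-likelihood on the test data; the
# bimodal mixture built below can be scored the same way for a direct comparison.
nll_unimodal = -tf.reduce_mean(nd_test.log_prob(Y_test[:, 5])).numpy()
print(nll_unimodal)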
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
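# Editor's hedged sketch: because model_mx outputs a tfd.Distribution, whole forecast
# paths can be drawn from yhat directly, e.g. 100 Monte Carlo scenarios with shape
# (100, 1, 10). The variable name is illustrative only.
sample_paths = yhat.sample(100).numpy()
print(sample_paths.shape)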
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n#%%\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n#%%\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n#%%\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n#%%\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n#%%\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n#%%\n\nX_train.shape, Y_train.shape\n\n#%%\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", 
fontsize=16)\n plt.ylabel(\"$x(t)$\", fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n#%%\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n#%%\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n#%%\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n#%%\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n#%%\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n#%%\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n#%%\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n#%%\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n#%%\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n#%%\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n#%%\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n#%%\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n#%%\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n#%%\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n#%%\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n#%%\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n#%%\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n#%%\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n#%%\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n#%%\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n#%%\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n#%%\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n#%%\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n#%%\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n#%%", "original_comment": "# sample from the forecasted distribution\n", "target_code": "smpl = yhat.sample(100).numpy()\n", "project_metadata": {"full_name": "sinusgamma/multimodal_network", "description": "Mixture Density Network with Tensorflow Probability. 
Demonstrate the usefulness of multi-modal distribution outputs for neural networks.", "topics": [], "git_url": "git://github.com/sinusgamma/multimodal_network.git", "stars": 11, "watchers": 11, "forks": 0, "created": "2020-03-08T10:08:43Z", "size": 3194, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1217660}, "last_updated": "2021-01-04T15:29:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "yhat_sample = gm_test.sample()\nprint(yhat_sample)\n", "model": "docstring", "intent": "# sample from the forecasted distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. 
Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n\ndf_cinemas.head()\n\n\n# #### 2. Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. 
Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n\n', '.join([cat for cat in fs_categories])\n\n\ncinema = df_cinemas.loc[0]\n\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
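# A toy sketch of the weighted-matrix idea described above; the DataFrames below are
# made-up placeholders (names and numbers are purely illustrative) standing in for the
# venue-count matrices that are only built later in the notebook.

#%%

import pandas as pd

# hypothetical toy inputs: venue counts per favorite cinema / candidate location and category
categories = ['Food', 'Shop & Service', 'Metro Station']
fav_venue_counts = pd.DataFrame([[30, 25, 2], [28, 30, 1], [10, 5, 1]],
                                index=['Cinema A', 'Cinema B', 'Cinema C'],
                                columns=categories)
fav_ratings = pd.Series([4.5, 4.3, 1.5], index=fav_venue_counts.index)
location_venue_counts = pd.DataFrame([[20, 22, 1], [5, 3, 0]],
                                     index=['L1', 'L2'], columns=categories)

profile = fav_venue_counts.T.dot(fav_ratings)   # rating-weighted importance of each category
profile = profile / profile.sum()               # normalize into a preference profile
scores = location_venue_counts.dot(profile)     # match each candidate location to the profile
print(scores.sort_values(ascending=False))      # highest score = recommended candidate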
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n#%%\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n#%%\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n#%%\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n#%%\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n#%%\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n#%%\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n#%%\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n#%%\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n#%%\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n#%%\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n#%%\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n#%%\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n#%%\n\n', '.join([cat for cat in fs_categories])\n\n#%%\n\ncinema = df_cinemas.loc[0]\n\n#%%\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n#%%\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n#%%\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n#%%\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n#%%\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n#%%\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation", "original_comment": "# Check the cinemas dataset contains any duplicated address\n", "target_code": "duplicated = df_cinemas.duplicated('Address', keep=False)\n", "project_metadata": {"full_name": "meghsat/CourseraIBMdatascience_course", "description": "In this repo consists of the projects I had done as part of the coursera's IBM data science Professional certificate.", "topics": [], "git_url": "git://github.com/meghsat/CourseraIBMdatascience_course.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-04-08T05:37:45Z", "size": 4855, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 14626378}, "last_updated": "2020-05-28T09:51:40Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "cinema.Address.duplicated().sum()\n", "model": "natural", "intent": "# Check the cinemas dataset contains any duplicated address"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n\ncombine[0]\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n#%%\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n#%%\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n#%%\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n#%%\n\ncombine[0]", "original_comment": "# instead we can reindex:\n", "target_code": "combine.index = range(combine.count())\n", "project_metadata": {"full_name": "ContextLab/CDL-tutorials", "description": "Repo containing useful tutorials on different topics, methods, software tools, and packages used by the CDL", "topics": ["tutorial", "training-materials", 
"python", "bayesian-methods", "package-creation", "scientific-computing"], "git_url": "git://github.com/ContextLab/CDL-tutorials.git", "stars": 12, "watchers": 12, "forks": 2, "created": "2017-12-15T13:36:50Z", "size": 59045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 694197, "Python": 17099, "TeX": 9149, "Makefile": 5644, "Batchfile": 5096, "Dockerfile": 3050, "Shell": 128}, "last_updated": "2020-07-13T19:39:57Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "combine = pd.concat([s1, s2], axis=1)\ncombine\n", "model": "no-comments", "intent": "# we can reindex:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n\n# ##Inspecting the data\n\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n\nsns.countplot(x='Class', data=dat)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n#%%\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n#%%\n\n# ##Inspecting the data\n\n#%%\n\ndat = 
pd.read_csv('creditcard.csv')\ndat.head()\n\n#%%\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n#%%\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n#%%\n\nsns.countplot(x='Class', data=dat)", "original_comment": "# We won't be using \"Time\" variable\n", "target_code": "dat = dat.drop(['Time'], 1)\n", "project_metadata": {"full_name": "dpanagop/ML_and_AI_examples", "description": null, "topics": [], "git_url": "git://github.com/dpanagop/ML_and_AI_examples.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2019-07-16T10:55:13Z", "size": 12192, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5902376}, "last_updated": "2020-11-24T20:45:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "dat['Time'] = pd.to_datetime(dat['Time'])\n", "model": "docstring", "intent": "# We won't be using \"Time\" variable"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata[data.columns[data.dtypes == int]]\n\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import 
RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata[data.columns[data.dtypes == int]]\n\n#%%\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n#%%\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.get_covariance()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "explained_variance = pca.explained_variance_ratio_\nexplained_variance\n", "model": "docstring", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. 
[Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n\ntrainingSummary = model.summary\n", "original_context": "#!/usr/bin/env python\n# 
coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n#%%\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n#%%\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n#%%\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n#%%\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n#%%\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n#%%\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n#%%\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n#%%\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n#%%\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n#%%\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n#%%\n\n# Train\nmodel = 
lr.fit(train_data)\n\n\n# ## Model Summary\n\n#%%\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n#%%\n\ntrainingSummary = model.summary", "original_comment": "# train RMSE, MSE\n", "target_code": "print(\"RMSE: {}\".format(trainingSummary.rootMeanSquaredError))\nprint(\"MSE: {}\".format(trainingSummary.meanSquaredError))\nprint(\"R2: {}\".format(trainingSummary.r2))\n", "project_metadata": {"full_name": "dangkhoadl/my-BigData", "description": "A cache to store my Distributed System and Big Data resources", "topics": ["big-data", "coursera", "operating-systems", "distributed-systems", "cloud-computing"], "git_url": "git://github.com/dangkhoadl/my-BigData.git", "stars": 7, "watchers": 7, "forks": 8, "created": "2017-12-23T05:56:43Z", "size": 49086, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 888066, "C++": 48288, "Shell": 6317, "Python": 3334, "Makefile": 990}, "last_updated": "2020-01-21T03:30:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "precision": "Agree", "precision-score": 2, "coverage": "Disagree", "coverage-score": 1, "usefulness": "Agree", "usefulness-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "evaluation = model.evaluate(test_data)\nprint(\"RMSE: {}\".format(evaluation.rootMeanSquaredError))\nprint(\"MSE: {}\".format(evaluation.meanSquaredError))\nprint(\"R2: {}\".format(evaluation.r2))\n", "model": "docstring", "intent": "# train RMSE, MSE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. 
So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n#%%", "original_comment": "# Let's import the set\n", "target_code": "import pandas as pd\n\ndf = pd.read_csv('../data/train.csv', index_col=0)\n", "project_metadata": {"full_name": "commit-live-students/GLabs_DSMX", "description": null, "topics": [], "git_url": "git://github.com/commit-live-students/GLabs_DSMX.git", "stars": 6, "watchers": 6, "forks": 23, "created": "2020-03-27T12:43:39Z", "size": 19480, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12966885}, "last_updated": "2020-12-24T07:12:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "dataset = pd.read_csv('../data/titanic.csv')\n", "model": "natural", "intent": "# Let's import the set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's start with the basics\n\n\nsimple = list(range(1, 19))\nsimple\n\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n\n# Select the 1st item using negative 
indexation\nsimple[-18]\n\n\n# Select the last item using positive indexation\nsimple[17]\n\n\n# Select the last item using negative indexation\nsimple[-1]\n\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n\n# What is the biggest number in the list?\nmax(simple)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's start with the basics\n\n#%%\n\nsimple = list(range(1, 19))\nsimple\n\n#%%\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n#%%\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n#%%\n\n# Select the last item using positive indexation\nsimple[17]\n\n#%%\n\n# Select the last item using negative indexation\nsimple[-1]\n\n#%%\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n#%%\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n#%%\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n#%%\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n#%%\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n#%%\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n#%%\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n#%%\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n#%%\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n#%%\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n#%%\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n#%%\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n#%%\n\n# What is the biggest number in the list?\nmax(simple)\n\n#%%", "original_comment": "# And the smallest?\n", "target_code": "min(simple)\n", "project_metadata": {"full_name": "shotleft/how-to-python", "description": null, "topics": [], "git_url": "git://github.com/shotleft/how-to-python.git", "stars": 11, "watchers": 11, "forks": 4, "created": "2018-05-03T04:32:17Z", "size": 3364, 
"license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2974562}, "last_updated": "2020-12-05T20:07:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "simple = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\nsimple.sort()\nsimple\n", "model": "no-comments", "intent": "# And the smallest?"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. 
Let's import the libraries by the execution of the statements below:\n\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. 
It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) if $z_{i}(t) < -\alpha$;\n# >- **\"Close Long-signal\"** (0.0) if $|z_{i}(t)| < \beta$;\n# >- **\"Short-signal\"** (-1.0) if $z_{i}(t) > +\alpha$; and,\n# >- **\"Close Short-signal\"** (0.0) if $|z_{i}(t)| < \beta$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set the two Z-Score thresholds to $\alpha = 1.0$ and $\beta = 0.5$, respectively, as done in the following:\n\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently, we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: no active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: active trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yet reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# merge the trading signals with the market price data\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. 
Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n#%%\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n#%%\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n#%%\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n#%%\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n#%%\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n#%%\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n#%%\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n#%%\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n#%%\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n#%%\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n#%%\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n#%%\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n#%%\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n#%%\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) if $z_{i}(t) < -\alpha$;\n# >- **\"Close Long-signal\"** (0.0) if $|z_{i}(t)| < \beta$;\n# >- **\"Short-signal\"** (-1.0) if $z_{i}(t) > +\alpha$; and,\n# >- **\"Close Short-signal\"** (0.0) if $|z_{i}(t)| < \beta$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set the two Z-Score thresholds to $\alpha = 1.0$ and $\beta = 0.5$, respectively, as done in the following:\n\n#%%\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently, we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n#%%\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: no active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: active trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yet reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# merge the trading signals with the market price data\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n#%%\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n#%%\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n#%%\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n#%%\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n#%%\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n#%%\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n#%%\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n#%%\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n#%%\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n#%%\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n#%%\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n#%%\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n#%%\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n#%%\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)", "original_comment": "# set axis labels\n", "target_code": "ax.set_xlabel('[month]', fontsize=10)\nax.set_ylabel('[year]', fontsize=10)\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "backtest_results_eurusd = backtest_results_eurusd.join(\n backtest_results_eurusd_details)\nbacktest_results_eurusd.head(10)\n", "model": "no-comments", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. 
The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Vectors, matrices and norms\n\n# The notebook demonstrate the computation and use of some important concepts in linear algebra. NumPy is used for the numerical computations.\n\n# ## Vector norms\n\n# The $l_{p}$-norm,of a vector $\\boldsymbol{x} \\in \\mathbb{C}^{n}$ is\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{p} = \\left( \\sum_{i=1}^{n} |x_{i}|^{p} \\right)^{1/p}\n# $$\n#\n# Recall that when $p = \\infty$, we have have the maxiumum norm:\n#\n# $$\n# \\| \\boldsymbol{x} \\|_{\\infty} = \\max(|x_{1}|, \\ldots , |x_{n}|)\n# $$\n#\n#\n#\n# NumPy can compute $l_{p}$ norms of vectors. To see how, we first import NumPy and create a random vectors of length 10:\n\n#%%\n\nimport scipy.linalg as la\nimport numpy as np\nnp.random.seed(2)\n\nx = np.random.rand(10) + 1j*np.random.rand(10)\nprint(x)\n\n\n# We can now compute a number of $l_{p}$ norms of $\\boldsymbol{x}$:\n\n#%%\n\nfor p in range(1, 5):\n x_norm = np.linalg.norm(x, p)\n print(\"The l_{} norm of x is: {}\".format(p, x_norm))\n\n\n# For the $l_{\\infty}$ norm:\n\n#%%\n\nx_inf = np.linalg.norm(x, np.inf)\nprint(\"The max norm of x is: {}\".format(x_inf))\n\n\n# ## Matrix norms\n\n# Norms of matrices can also be computed. The more interesting (and abstract) norms are *operator* norms. These are also known as *induced* norms.\n\n# ### Operator norms\n\n# For an $n \\times n$ matrix $\\boldsymbol{A}$, the norm of the matrix is a measure of the 'maximum change' in relative length it can induce when applied to a vector. If we consider:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le C \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# then the smallest possible $C$ is the norm of $\\boldsymbol{A}$. 
The norm of $\\boldsymbol{A}$ is denoted by $\\|\\boldsymbol{A}\\|$:\n#\n# $$\n# \\| \\boldsymbol{A} \\boldsymbol{x} \\| \\le \\| \\boldsymbol{A}\\| \\| \\boldsymbol{x}\\| \\quad \\forall \\boldsymbol{x} \\in \\mathbb{C}^{d},\n# $$\n#\n# This can be rearranged to provide the usual definition of a matrix norm:\n#\n# $$\n# \\| \\boldsymbol{A} \\| = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|}{\\|\\boldsymbol{x}\\| }\n# $$\n#\n# To compute actual norms of a matrix, we need to choose how we measure the length of a vector, i.e. which norm to use. If we choose the $l_{2}$-norm, then:\n#\n# $$\n# \\| \\boldsymbol{A} \\|_{2} = \\max_{\\boldsymbol{x} \\in \\mathbb{C}^{n} \\backslash \\boldsymbol{0}}\n# \\frac{\\| \\boldsymbol{A} \\boldsymbol{x}\\|_{2}}{\\|\\boldsymbol{x}\\|_{2} }\n# $$\n#\n# As discussed in the lectures, some norms are relatively inexpensive to compute for large matrices, and others are expensive. We can again use NumPy to compute some matrix norms. We first create a matrix filled with random numbers:\n\n#%%\n\nA = np.random.rand(5, 5) + 1j*np.random.rand(5, 5)\nprint(A)\n\n\n# and then compute some norms:\n\n#%%\n\nprint(\"The 1-norm of A is: {}\".format(np.linalg.norm(A, 1)))\nprint(\"The 2-norm of A is: {}\".format(np.linalg.norm(A, 2)))\nprint(\"The max-norm of A is: {}\".format(np.linalg.norm(A, np.inf)))\n\n\n# ### Vector-like norms\n\n# It sometimes convenient to work with matrix norms that are similar to vector norms. A commonly used matrix norm is the Frobenius norm. It is analogous to the $l_{2}$ norm of a vector, and is defined by:\n#\n# $$\n# \\|\\boldsymbol{A} \\|_{F} = \\left( \\sum_{i}\\sum_{i} a_{ij}^{2} \\right)^{1/2}.\n# $$\n#", "original_comment": "# To compute the Frobenius norm:\n", "target_code": "A_frobenius = np.linalg.norm(A, 'fro')\n", "project_metadata": {"full_name": "garth-wells/notebooks-3M1", "description": "Jupyter notebooks (Python) for the course 3M1 at the Department of Engineering, University of Cambridge", "topics": ["linear-algebra", "singular-value-decomposition", "regression"], "git_url": "git://github.com/garth-wells/notebooks-3M1.git", "stars": 10, "watchers": 10, "forks": 18, "created": "2015-01-12T22:32:25Z", "size": 128315, "license": "bsd-2-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7472485}, "last_updated": "2021-01-04T10:34:46Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "np.linalg.norm(A, 1)\n", "model": "docstring", "intent": "# To compute the Frobenius norm:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. 
By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. 
This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. 
If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. However, it is important that the corpus is representative for this specific topic.\n\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. 
It helps to ignore your search keywords.\n\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. This is important so that the results are not distorted by over- or under-representation of a category.\n\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. 
Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n\ncorpus = train_df.bow\n\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the test corpus \n# First, have a 
look at your test corpus\n\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n\nresult = np.append(df_all.text.values.reshape(-1, 1),\n 
submission_predictions, axis=1)\n\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Text classification for topic-specific newspaper collections\n\n# Text classification is the process of categorizing text into pre-defined groups. By using Natural Language Processing (NLP), text classifiers can automatically analyze text and then assign a set of given categories based on the research question. This automated classification of text into predefined categories is an important method for managing and processing a large number of newspaper clippings. This also applies to subcorpora for a specific research topic (e.g. migration). The aim of this notebook is to train a model using your previously manually created training/test corpus and to use this model to get an overview of the category distribution throughout your collection (see figure below). Another goal is to export your categorized data for further analysis. 
This makes it possible to examine, for example, the advertisement about a specific topic.\n#\n# This notebook was used with a collection for the case study on emigration (1850-1950) and shows how a model can be trained to classify topic-specific collections. For the training/testing corpus, a collection with the keywords \"Auswander*\", \"Ausgewanderte\", \"Emigrant*\", \"Emigrierte\", \"Emigration\", \"Kolonist*\", and \"Ansiedler*\" (all different German words for emigrants or emigration) have been created. In addition, information on the pre-defined gropus (news, ads, culture...) were added using numbers between one and ten.\n#\n# For classification, topic modelling (LDA) was chosen because it showed the best performance in classification (after experiments with word embeddings or LDA and word embeddings combined). LDA provides a way to group documents by topic and perform similarity searches and improve precision. Thanks to sklearn, it is relatively easy to test different classifiers for a given topic classification task. Logistic regression was chosen as binary classifier.\n#\n# *Following graph demonstrates the distribution of the pre-defined categories in newspaper clippings of seleceted Austrian Newspapers (sample of 1631 newspaper clippings) on the topic of emigration.*\n#\n# ![Collection on the topic of Emigration](images/cat.PNG)\n#\n#\n# Read more about Topic Modeling and Logistic Regression Model Tuning.\n#\n# Acknowledgments:\n#\n# This work has been inspired by a notebook on LDA and word embeddings and several other soursces that provided help on how to buid models. This work was supported by the European Union's Horizon 2020 research and innovation programme under grant 770299 (NewsEye).\n\n# ## Step by step...\n# * [Prepare a small manually annotated collection](#1-bullet)\n# * [Install packages in command line](#2-bullet)\n# * [Import packages](#3-bullet)\n# * [Import your manually annotated newspaper collection](#4-bullet)\n# * [Clean and tokenize the text (pre-processing)](#5-bullet)\n# * [Have a look at your data](#6-bullet)\n# * [Use your dataset to create a training corpus and test corpus](#7-bullet)\n# * [Create topic models using your training corpus](#8-bullet)\n# * [Have a look at your topics](#9-bullet)\n# * [Create the feature vector ](#10-bullet)\n# * [Have a look at the top words for each category](#11-bullet)\n# * [Classification and hyperparameter tuning](#12-bullet)\n# * [Using the test corpus](#13-bullet)\n# * [Logistic Regression](#14-bullet)\n# * [Now it is time to make the classifications](#15-bullet)\n# * [Calculate the score for each category as well as the overall score](#16-bullet)\n# * [If your overall score is higher than 80 percent, you can start to use your whole collection](#17-bullet)\n# * [Clean (pre-process) your whole collection](#18-bullet)\n# * [Now it is time to make the classifications for the whole collection](#19-bullet)\n# * [Create a dataframe with the results](#20-bullet)\n# * [If you are satisfied with the results, you can save them in the form of your original file](#21-bullet)\n# * [Visualize your results](#22-bullet)\n#\n\n# ## Prepare a small manually annotated collection \n#\n# This program uses annotations for evaluation and classification. Therefore, a manually annotated collection of 80 to 100 articles per category is needed to work with this program. 
To create this collection, the numbers 0 to 7 have been assignet to the articles, each number representing one newspaper category (ads, news, culture_literature_stories_letters, appeals_donations_information, crime, finance, statistic). When you create your own collection, make sure you create a representative collection of the whole search result. If you use a long time period, make sure all timer periods are represented in your small collection.\n# The newspaper articles with the annotations should be saved as CSV.\n\n# ## Install packages in command line \n#\n# If you need help on how to pip install, have a look at this tutorial: https://packaging.python.org/tutorials/installing-packages/\n#\n# pip install gensim\n#\n# pip install PyLDAvis\n#\n# pip install spacy\n#\n# python -m spacy download de_core_web_sm\n#\n# pip install pandas\n#\n# pip install regex\n#\n# pip install nltk\n#\n# pip install matplotlib\n#\n# pip install numpy\n#\n# pip install seaborn\n#\n# pip install sklearn\n\n# ## Import packages \n#\n# Before you can get started, you have to install and import some packages.\n#\n# #### Make sure you use the version 1.9.0 with smart_open: python -m pip install --upgrade smart_open==1.9.0\n#\n\n#%%\n\n# more common imports\nimport matplotlib.axes as ax\nfrom nltk import FreqDist\nimport pandas as pd\nimport numpy as np\nfrom collections import Counter\nimport re\nimport sys\nimport time\n\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom gensim.utils import lemmatize, simple_preprocess\nimport spacy\n\n# model imports\nfrom gensim.models.ldamulticore import LdaMulticore\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\nfrom sklearn.linear_model import LogisticRegression\n\n# LDA\nimport nltk\nimport gensim\nfrom gensim.corpora import Dictionary\nimport gensim\nimport spacy\nimport logging\nimport warnings\nimport gensim.corpora as corpora\nfrom gensim.models import CoherenceModel\nfrom nltk.corpus import stopwords\nfrom gensim.models import LdaModel\nfrom gensim import models, corpora, similarities\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimport base64\nimport io\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Frequency\n\n#%%\n\npd.set_option('mode.chained_assignment', None)\n\n\n# ## Import your manually annotated newspaper collection \n# And have a look at your data\n\n#%%\n\ndf = pd.read_csv('export_classification_emigration_new_06_07_2020_23_15.csv')\n\nprint(df.shape)\ndf.head(3)\n\n\n# ### Check the distribution of your annotated categories.\n# Each category is assignet to a number:\n#\n# 0 = Advertisements\n#\n# 1 = News\n#\n# 2 = Culture, Literature, Stories, and Letters\n#\n# 3 = Appeals, Donations, and (help) informations\n#\n# 4 = Crime\n#\n# 6 = Finance\n#\n# 7 = Statistic\n#\n# These categories where specifically chosen for a collection on emigration between 1850 and 1950. For this corpus, about 80 articles for each categorie were sufficient to train a model that delivers good results. 
However, it is important that the corpus is representative for this specific topic.\n\n#%%\n\ndf.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Clean and tokenize the text (pre-processing) \n#\n# Before you can start with the training fo the topic models, you have to clean the text of your newspaper articles. The follwing functions remove punctuations, lower case the text, remove stop words and lemmatize the text.\n#\n# #### Stop words:\n# You can change the language used for the stop words. You can also add your own stop words or other words you would like to ignore. It helps to ignore your search keywords.\n\n#%%\n\n# Functions to clean, tokenize, and lemmatize the data\ndef initial_clean(text):\n text = re.sub(r'[^\\w\\s]', '', text)\n text = text.lower()\n text = nltk.word_tokenize(text)\n return text\n\n\nstop_words = stopwords.words('german') # change the language here\n# add your onw stop words\nstop_words.extend(['auswanderer', 'auswanderung', 'auswanderern'])\n\n\ndef remove_stop_words(text):\n return [word for word in text if word not in stop_words]\n\n\nstemmer = PorterStemmer()\n\n\ndef stem_words(text):\n try:\n text = [stemmer.stem(word) for word in text]\n text = [word for word in text if len(word) > 1]\n except IndexError:\n pass\n return text\n\n\ndef apply_all(text):\n return stem_words(remove_stop_words(initial_clean(text)))\n\n#%%\n\ndf['tokenized'] = df['text'].apply(apply_all)\n\n\n# ## Have a look at your data \n#\n# Check out, if everything went alright so far. Have a look at the number of words and their frequency distribution.\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf['doc_len'] = df['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df['doc_len'])\ndf.drop(labels='doc_len', axis=1, inplace=True)\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n#\n# ## Use your dataset to create a training corpus and test corpus \n#\n# Before we use our model on a bigger, unseen collection, we use our manual annotated dataset to train the models and classify the newspaper clippings. This helps to control the output (the annotations show if the automated classification has worked corretly) and to adapt the code in order to get the best results for your own collection.\n#\n# You can change the size of training and testing corpus by changing the number in following line: msk = np.random.rand(len(df)) < 0.899\n#\n\n#%%\n\n# create a mask of binary values\nmsk = np.random.rand(len(df)) < 0.899\n\n#%%\n\ntrain_df = df[msk]\ntrain_df.reset_index(drop=True, inplace=True)\n\ntest_df = df[~msk]\ntest_df.reset_index(drop=True, inplace=True)\n\n#%%\n\nprint(len(df), len(train_df), len(test_df))\n\n\n# ### Make sure, all categories have the same size\n# Therefore we shorten the training corpus to the number of the smallest category in the corpus. 
This is important so that the results are not distorted by over- or under-representation of a category.\n\n#%%\n\nval = train_df.relevancy.value_counts().min()\ntrain_df = train_df.groupby('relevancy').head(val)\n\n#%%\n\ntrain_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ### Have a look at the training corpus\n# Make sure there are enough articles of each category represented in the training corpus. The training corpus will be used to mesure the score of the classfication results by using the manual assigned information.\n\n#%%\n\ntest_df.relevancy.value_counts().plot(kind='bar')\n\n\n# ## Create topic models using your training corpus \n#\n# The function \"train_lda\" trains the lda model. You can change the parameters like number of topics or chunksize, but also the change of the alpha and eta parameters can change the results a lot. For the text classification, a high number of topics is best suited. Of course, this can change from research question to research question, and it makes sense to train your models with a changing number of topics to find out which amount works best for your collection.\n#\n# The program is doing also several passes of the data since this is a small dataset, so we want the distributions to stabilize.\n#\n# It is also important to note that changing the parameters may lead to better results for some categories but worse results for others. If an overall good result is important, the parameters should be adjusted accordingly. On the other hand, if a good result is important for certain categories, you can simply ignore the result of those you do not need. The score is calculated after the model has been trained and the collection classified. To find out, which parameters work the best for your corpus, you simply have to try out a view times and see what happens when you change the parameters. Every collection is different.\n#\n\n#%%\n\ndictionary = corpora.Dictionary(train_df['tokenized'])\n\n#%%\n\n# Make a BOW for every document (Bag of words)\ndef document_to_bow(df):\n train_df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), train_df['tokenized']))\n\n\ndocument_to_bow(train_df)\n\n#%%\n\ncorpus = train_df.bow\n\n#%%\n\ndef train_lda(data):\n num_topics = 500\n chunksize = 8000\n t1 = time.time()\n lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,\n chunksize=chunksize, minimum_probability=0.0, passes=50, iterations=500, per_word_topics=True)\n return dictionary, corpus, lda\n\n\n# #### The training of the topic models takes a few minutes. But it is worh the waiting time\n\n#%%\n\nget_ipython().run_cell_magic('time', '', 'dictionary,corpus,lda = train_lda(train_df)')\n\n\n# ## Have a look at your topics \n# Inspect the outcome of your topics. You can see all your topics in changing the topicid to the number of topic you want to see. You can also adapt the number of tokens (topn) by changing the number.\n\n#%%\n\nlda.show_topic(topicid=0, topn=20)\n\n\n# ## Now it is time to create the feature vector \n# Freature vectore is an n-dimensional vector of numerical features that represent some object. 
Many algorithms in machine learning require a numerical representation of objects, since such representations facilitate processing and statistical analysis.\n\n#%%\n\ndef document_to_lda_features(lda, document):\n \"\"\" Transforms a bag of words document to features.\n It returns the proportion of how much each topic was\n present in the document.\n \"\"\"\n topic_importances = lda.get_document_topics(\n document, minimum_probability=0)\n topic_importances = np.array(topic_importances)\n return topic_importances[:, 1]\n\n\ntrain_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n train_df.bow))\n\n#%%\n\nads_topic_distribution = train_df.loc[train_df.relevancy ==\n 0, 'lda_features'].mean()\nnews_topic_distribution = train_df.loc[train_df.relevancy == 1, 'lda_features'].mean(\n)\nculture_topic_distribution = train_df.loc[train_df.relevancy == 2, 'lda_features'].mean(\n)\nappeals_topic_distribution = train_df.loc[train_df.relevancy == 3, 'lda_features'].mean(\n)\ncrime_topic_distribution = train_df.loc[train_df.relevancy == 4, 'lda_features'].mean(\n)\nfinances_topic_distribution = train_df.loc[train_df.relevancy == 6, 'lda_features'].mean(\n)\nstatistic_topic_distribution = train_df.loc[train_df.relevancy == 7, 'lda_features'].mean(\n)\n\n\n# ## Have a look at the top words for each category \n\n#%%\n\ndef get_topic_top_words(lda_model, topic_id, nr_top_words=5):\n \"\"\" Returns the top words for topic_id from lda_model.\n \"\"\"\n id_tuples = lda_model.get_topic_terms(topic_id, topn=nr_top_words)\n word_ids = np.array(id_tuples)[:, 0]\n words = map(lambda id_: lda_model.id2word[id_], word_ids)\n return words\n\n#%%\n\nfor relevancy, distribution in zip(['ads', 'news', 'culture', 'appeals', 'crime', 'finances', 'statistic'], [ads_topic_distribution, news_topic_distribution, culture_topic_distribution, appeals_topic_distribution, crime_topic_distribution, finances_topic_distribution, statistic_topic_distribution]):\n print(\"Looking up top words from top topics from {}.\".format(relevancy))\n for x in sorted(np.argsort(distribution)[-5:]):\n top_words = get_topic_top_words(lda, x)\n print(\"For topic {}, the top words are: {}.\".format(\n x, \", \".join(top_words)))\n print(\"\")\n\n\n# ## Classification and hyperparameter tuning \n# After transforming the documents into features, it is important to apply a few supervised classifiers to be able to predict what text belongs to which category.\n\n#%%\n\ndef get_cross_validated_model(model, param_grid, X, y, nr_folds=6):\n \"\"\" Trains a model by doing a grid search combined with cross validation.\n args:\n model: your model\n param_grid: dict of parameter values for the grid search\n returns:\n Model trained on entire dataset with hyperparameters chosen from best results in the grid search.\n \"\"\"\n # train the model (since the evaluation is based on the logloss, we'll use neg_log_loss here)\n grid_cv = GridSearchCV(model, param_grid=param_grid,\n scoring='neg_log_loss', cv=nr_folds, n_jobs=-1, verbose=True)\n best_model = grid_cv.fit(X, y)\n # show top models with parameter values\n result_df = pd.DataFrame(best_model.cv_results_)\n show_columns = ['mean_test_score', 'rank_test_score']\n for col in result_df.columns:\n if col.startswith('param_'):\n show_columns.append(col)\n display(result_df[show_columns].sort_values(by='rank_test_score').head())\n return best_model\n\n#%%\n\n# we first have to transform every entry\nX_train_lda = np.array(list(map(np.array, train_df.lda_features)))\n\n\n# ## Using the 
test corpus \n# First, have a look at your test corpus\n\n#%%\n\ntest_df.head()\n\n\n# #### Pre-process your test corpus using the same function than for the train corpus\n\n#%%\n\ntest_df['tokenized'] = test_df['text'].apply(apply_all)\n\n\n# #### Make a bag of words for every document\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), test_df['tokenized']))\n\n\ndocument_to_bow(test_df)\n\n\n# #### Get feature vectores for your test corpus\n\n#%%\n\ntest_df['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n test_df.bow))\n\n#%%\n\nX_test_lda = np.array(list(map(np.array, test_df.lda_features)))\n\n#%%\n\n# store all models in a dictionary\nmodels = dict()\n\n\n# ## Logistic Regression \n# Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.\n\n#%%\n\nlr = LogisticRegression()\n\nparam_grid = {'penalty': ['l1', 'l2']}\n\nbest_lr_lda = get_cross_validated_model(\n lr, param_grid, X_train_lda, train_df.relevancy)\n\nmodels['best_lr_lda'] = best_lr_lda\n\n\n# ## Now it is time to make the classifications \n# First we get a data frame with the result for each category. The category with the highest number is the category to which the article is assigned.\n#\n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_test_lda)\n\n#%%\n\nresult = np.append(test_df.relevancy.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nsubmission_df = pd.DataFrame(data=result, columns=[\n 'relevancy', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n\n# #### Have a look if everything worked correctly\n# The first article contains the manual annotation (4.0), which means this article belongs to the category of crime. As you can see, the highest number for this row is in the column of crime. So this article has been classified correctly.\n\n#%%\n\nsubmission_df.head(5)\n\n\n# ## Calculate the score for each category as well as the overall score \n# Repeat the calculation (topic models need sometimes several rounds) or adapt the code until you get a higher score than 80 percent. 
If you get a higher score than 80 percent, you can continue with your whole dataset.\n\n#%%\n\nmax_num = submission_df.loc[:,\n submission_df.columns != 'relevancy'].max(axis=1)\n\n#%%\n\ndf_ads = submission_df[['ads', 'relevancy']\n ][submission_df['ads'].isin(max_num)]\n\nads_list = []\nfor key, value in df_ads.items():\n for rel in value:\n if len(str(rel)) < 4:\n ads_list.append(rel)\n\nads_right = []\nads_wrong = []\nfor num in ads_list:\n if num == 0.0:\n ads_right.append(num)\n else:\n ads_wrong.append(num)\nall_ = len(ads_right) + len(ads_wrong)\nads_score = len(ads_right) / all_\nprint(f\"Your score for ads is: {ads_score}\")\n\ndf_news = submission_df[['news', 'relevancy']\n ][submission_df['news'].isin(max_num)]\n\n\nnews_list = []\nfor key, value in df_news.items():\n for rel in value:\n if len(str(rel)) < 4:\n news_list.append(rel)\n\nnews_right = []\nnews_wrong = []\nfor num in news_list:\n if num == 1.0:\n news_right.append(num)\n else:\n news_wrong.append(num)\nall_ = len(news_right) + len(news_wrong)\nnews_score = len(news_right) / all_\nprint(f\"Your score for news is: {news_score}\")\n\n\ndf_culture = submission_df[['culture', 'relevancy']\n ][submission_df['culture'].isin(max_num)]\n\nculture_list = []\nfor key, value in df_culture.items():\n for rel in value:\n if len(str(rel)) < 4:\n culture_list.append(rel)\n\nculture_right = []\nculture_wrong = []\nfor num in culture_list:\n if num == 2.0:\n culture_right.append(num)\n else:\n culture_wrong.append(num)\nall_ = len(culture_right) + len(culture_wrong)\nculture_score = len(culture_right) / all_\nprint(f\"Your score for culture is: {culture_score}\")\n\ndf_appeals = submission_df[['appeals', 'relevancy']\n ][submission_df['appeals'].isin(max_num)]\n\nappeals_list = []\nfor key, value in df_appeals.items():\n for rel in value:\n if len(str(rel)) < 4:\n appeals_list.append(rel)\n\nappeals_right = []\nappeals_wrong = []\nfor num in appeals_list:\n if num == 3.0:\n appeals_right.append(num)\n else:\n appeals_wrong.append(num)\nall_ = len(appeals_right) + len(appeals_wrong)\nappeals_score = len(appeals_right) / all_\nprint(f\"Your score for appeals is: {appeals_score}\")\n\ndf_crime = submission_df[['crime', 'relevancy']\n ][submission_df['crime'].isin(max_num)]\n\ncrime_list = []\nfor key, value in df_crime.items():\n for rel in value:\n if len(str(rel)) < 4:\n crime_list.append(rel)\n\ncrime_right = []\ncrime_wrong = []\nfor num in crime_list:\n if num == 4.0:\n crime_right.append(num)\n else:\n crime_wrong.append(num)\nall_ = len(crime_right) + len(crime_wrong)\ncrime_score = len(crime_right) / all_\nprint(f\"Your score for crime is: {crime_score}\")\n\n\ndf_finances = submission_df[['finance', 'relevancy']\n ][submission_df['finance'].isin(max_num)]\n\nfinances_list = []\nfor key, value in df_finances.items():\n for rel in value:\n if len(str(rel)) < 4:\n finances_list.append(rel)\n\nfinances_right = []\nfinances_wrong = []\nfor num in finances_list:\n if num == 6.0:\n finances_right.append(num)\n else:\n finances_wrong.append(num)\nall_ = len(finances_right) + len(finances_wrong)\nfinance_score = len(finances_right) / all_\nprint(f\"Your score for finances is: {finance_score}\")\n\ndf_statistic = submission_df[['statistic', 'relevancy']\n ][submission_df['statistic'].isin(max_num)]\n\nstatistic_list = []\nfor key, value in df_statistic.items():\n for rel in value:\n if len(str(rel)) < 4:\n statistic_list.append(rel)\n\nstatistic_right = []\nstatistic_wrong = []\nfor num in statistic_list:\n if num == 7.0:\n 
statistic_right.append(num)\n else:\n statistic_wrong.append(num)\nall_ = len(statistic_right) + len(statistic_wrong)\nstatistic_score = len(statistic_right) / all_\nprint(f\"Your score for statistic is: {statistic_score}\")\noverall_score = (ads_score + news_score + culture_score +\n appeals_score + crime_score + finance_score + statistic_score) / 7\n\nprint(f\"Your overall score is {overall_score}\")\n\n\n# # If your overall score is higher than 80 percent, you can start to use your whole collection \n#\n# Start with importing your whole collection. Import the same collection twice for the futher processing.\n#\n# Note: If you are mainly interested in one of the catecories, it makes sense to choose a model with a high score for that category. If you want a good overview of the distribution of the categories, a overall good score is more important.\n\n#%%\n\ndf_all = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\ndf_all_2 = pd.read_csv('export_auswanderer_06_07_2020_22_38.csv', usecols=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text'])\n\n\n# ### Have a look at your data\n\n#%%\n\ndf_all.head()\n\n\n# ## Clean (pre-process) your whole collection \n#\n# You repeat the same steps you did with your training and test corpus\n\n#%%\n\ndf_all['tokenized'] = df_all['text'].apply(apply_all)\n\n\n# #### Again, have a look at your data\n\n#%%\n\n# first get a list of all words\nall_words = [word for item in list(df_all['tokenized']) for word in item]\n# use nltk fdist to get a frequency distribution of all words\nfdist = FreqDist(all_words)\nf\"The number of unique words is {len(fdist)}\"\n\n#%%\n\n# document length\ndf_all['doc_len'] = df_all['tokenized'].apply(lambda x: len(x))\ndoc_lengths = list(df_all['doc_len'])\ndf_all.drop(labels='doc_len', axis=1, inplace=True)\n\n\nprint(f\"length of list: {len(doc_lengths)}\")\nprint(f\"average document length: {np.average(doc_lengths)}\")\nprint(f\"minimum document length: {min(doc_lengths)}\")\nprint(f\"maximum document length: {max(doc_lengths)}\")\n\n\n# #### Remove articles that are smaller than 5 tokens\n\n#%%\n\ndf_all = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all = df_all[df_all['tokenized'].map(type) == list]\ndf_all.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\ndf_all_2 = df_all[df_all['tokenized'].map(len) >= 5]\ndf_all_2 = df_all[df_all['tokenized'].map(type) == list]\ndf_all_2.reset_index(drop=True, inplace=True)\nprint(\"After cleaning and excluding short aticles, the dataframe now has:\",\n len(df_all), \"articles\")\n\n\n# #### Make a BOW (bag of words) for every document and get feature vectores for your whole collection\n\n#%%\n\ndef document_to_bow(df):\n df['bow'] = list(\n map(lambda doc: dictionary.doc2bow(doc), df_all['tokenized']))\n\n\ndocument_to_bow(df_all)\n\n#%%\n\ndf_all['lda_features'] = list(map(lambda doc:\n document_to_lda_features(lda, doc),\n df_all.bow))\n\n#%%\n\nX_all_lda = np.array(list(map(np.array, df_all.lda_features)))\n\n\n# ## Now it is time to make the classifications for the whole collection \n\n#%%\n\nsubmission_predictions = best_lr_lda.predict_proba(X_all_lda)\n\n\n# To be able to create two different outputs, one with the results per category to check the result [57] and one with the results in the form of your original file [61], two different types of results are created.\n\n#%%\n\nresult = 
np.append(df_all.text.values.reshape(-1, 1),\n submission_predictions, axis=1)\n\n#%%\n\nresult_2 = np.append(df_all_2, submission_predictions, axis=1)\n\n#%%\n\nsubmission_df_all = pd.DataFrame(data=result, columns=[\n 'text', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\nsubmission_df_all_2 = pd.DataFrame(data=result_2, columns=[\n 'id', 'language', 'date', 'newspaper_id', 'iiif_url', 'text', 'token', 'ads', 'news', 'culture', 'appeals', 'crime', 'finance', 'statistic'])\n\n#%%\n\nsubmission_df_all.head(5)\n\n\n# ## Create a dataframe with the results \n\n#%%\n\nmax_num = submission_df_all.loc[:,\n submission_df_all.columns != 'text'].max(axis=1)\n\n#%%\n\nads = submission_df_all[['text']][submission_df_all['ads'].isin(max_num)]\nnews = submission_df_all[['text']][submission_df_all['news'].isin(max_num)]\nculture = submission_df_all[['text']\n ][submission_df_all['culture'].isin(max_num)]\nappeals = submission_df_all[['text']\n ][submission_df_all['appeals'].isin(max_num)]\ncrime = submission_df_all[['text']][submission_df_all['crime'].isin(max_num)]\nfinance = submission_df_all[['text']\n ][submission_df_all['finance'].isin(max_num)]\nstatistic = submission_df_all[['text']\n ][submission_df_all['statistic'].isin(max_num)]\n\n#%%\n\n# transform your lists into a dataframe\n\ndf_ads = pd.DataFrame(np.column_stack([ads]),\n columns=['Ads'])\n\n\ndf_news = pd.DataFrame(np.column_stack([news]),\n columns=['News'])\n\ndf_culture = pd.DataFrame(np.column_stack([culture]),\n columns=['Culture_Literature'])\n\ndf_appeals = pd.DataFrame(np.column_stack([appeals]),\n columns=['Appeals_Donations'])\n\ndf_crime = pd.DataFrame(np.column_stack([crime]),\n columns=['Crime'])\n\n\ndf_finance = pd.DataFrame(np.column_stack([finance]),\n columns=['Finance'])\n\ndf_statistic = pd.DataFrame(np.column_stack([statistic]),\n columns=['Statistic'])\n\ndf_results = pd.concat([df_ads, df_news, df_culture, df_appeals,\n df_crime, df_finance, df_statistic], ignore_index=True, axis=1)\ndf_results.columns = ['Ads', 'News', 'Culture_Literatur',\n 'Appeals_Donations', 'Crime', 'Finance', 'Statistic']\ndf_results[15:20]\n\n\n# ### Now export your dataframe in order to check the results\n\n#%%\n\ndf_results.to_excel(\"results_emigration.xlsx\")\n\n\n# ### Get the dates for your classified articles\n# You will need them later for the visualization\n\n#%%\n\n# Extract the dates for the visualization for every category\ndate_ads = []\nads = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['ads'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['ads'].isin(max_num)]\nfor key in date:\n date_ads.append(key[:4])", "original_comment": "# Transform the dates from strings to integers\n", "target_code": "for i in range(0, len(date_ads)):\n date_ads[i] = int(date_ads[i])\n", "project_metadata": {"full_name": "NewsEye/NLP-Notebooks-Newspaper-Collections", "description": "A collection of notebooks for Natural Language Processing", "topics": ["lda", "topic-modeling", "shannon", "nlp-notebooks", "digital-humanities", "newspaper-collections", "newspaper-clippings", "text-classification", "similarity"], "git_url": "git://github.com/NewsEye/NLP-Notebooks-Newspaper-Collections.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-07-06T11:18:13Z", "size": 12866, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4306857}, "last_updated": "2020-12-01T08:54:40Z"}, "annotations": [{"completed_by": {"id": 
1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "date_news = []\nnews = submission_df_all_2[['id', 'language', 'date', 'newspaper_id',\n 'iiif_url', 'text']][submission_df_all['news'].isin(max_num)]\ndate = submission_df_all_2['date'][submission_df_all['news'].isin(max_num)]\nfor key in date:\n date_news.append(key[:4])\n", "model": "no-comments", "intent": "# Transform the dates from strings to integers"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n\ndf_reviews.head()\n\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n#%%\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as 
np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n#%%\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n#%%\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n#%%\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n#%%\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n#%%\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n#%%\n\ndf_reviews.head()\n\n#%%\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n#%%\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n#%%\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n#%%\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n#%%\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n#%%", "original_comment": "# Find value counts for every column in reviews.csv\n", "target_code": "df_reviews.apply(lambda x: x.isnull().value_counts())\n", "project_metadata": {"full_name": "sheetalbongale/ALE-gorithm", "description": "All things Beer! Beer Educator and Recommender Web App | Deployed on GCP > https://alegorithm-fxljyqhslq-uc.a.run.app/ | UT Data Analysis and Visualization Nov 2019 - May 2020. ", "topics": ["recommender", "gcp-cloud-build", "python-flask-application", "sqlalchemy", "plotlyjs", "anychart-javascript-library", "d3js", "mysql"], "git_url": "git://github.com/sheetalbongale/ALE-gorithm.git", "stars": 5, "watchers": 5, "forks": 5, "created": "2020-03-01T22:59:58Z", "size": 56307, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 204948, "JavaScript": 52038, "CSS": 48412, "HTML": 46213, "Python": 15403, "Dockerfile": 433}, "last_updated": "2020-05-07T08:39:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "print(df_beers.shape)\ndf_beers.head()\n", "model": "natural", "intent": "# Find value counts for every column in reviews.csv"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import 
LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. 
I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. (fips, regionidcity, regionidzip, regionidcounty)\n\n\ndf['age'] = 2017 - df.yearbuilt\n\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. I'll test it here, but will need to implement above before we do all the prepping.\n\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. 
I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. 
For this reason, we will use a min-max scaler.\n\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. k=8.\n\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. 
(low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n\ntrain.cluster_loc.value_counts()\n\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 7 clusters\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n\ntest.cluster_home.value_counts()\n\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\nless_significant_clusters\n\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if 
greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n\ntrain.head()\n\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n\ntrain_no_clusters.head()\n\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 models, 1 for each county\n# Try with clusters and then try with original 
features\n\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = 
lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n#%%\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n#%%\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n#%%\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n#%%\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a 
single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n#%%\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n#%%\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. (fips, regionidcity, regionidzip, regionidcounty)\n\n#%%\n\ndf['age'] = 2017 - df.yearbuilt\n\n#%%\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n#%%\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. 
I'll test it here, but will need to implement above before we do all the prepping.\n\n#%%\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n#%%\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n#%%\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n#%%\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n#%%\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n#%%\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n#%%\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n#%%\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n#%%\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = 
test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n#%%\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n#%%\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n#%%\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n#%%\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n#%%\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n#%%\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n#%%\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n#%%\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n#%%\n\ntrain.cluster_loc.value_counts()\n\n#%%\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n#%%\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n#%%\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n#%%\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n#%%\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n#%%\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n#%%\n\ntest.cluster_home.value_counts()\n\n#%%\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n#%%\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\nless_significant_clusters\n\n#%%\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n#%%\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n#%%\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n#%%\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n#%%\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data 
Validation**\n\n#%%\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n#%%\n\ntrain.head()\n\n#%%\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n#%%\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n#%%\n\ntrain_no_clusters.head()\n\n#%%\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n#%%\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n#%%\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 
models, 1 for each county\n# Try with clusters and then try with original features\n\n#%%\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n#%%\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n#%%\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n#%%\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n#%%\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n#%%\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n#%%\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = 
LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#", "original_comment": "# ##### Linear Support Vector Regressor from sklearn.svm\n", "target_code": "regr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\n", "project_metadata": {"full_name": "CodeupClassroom/bayes-methodologies-exercises", "description": "Bayes exercises on methodologies", "topics": [], "git_url": "git://github.com/CodeupClassroom/bayes-methodologies-exercises.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2019-10-09T14:04:48Z", "size": 13779, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 17490873, "Python": 71621}, "last_updated": "2020-01-06T20:54:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "svm = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\n# Fit model according to given training data\nsvm.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = svm.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\ndt = 
DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_\n", "model": "docstring", "intent": "# Linear Support Vector Regressor from sklearn.svm"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > 
test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom 
sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n#%%\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n#%%\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n#%%\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = 
month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n#%%\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n#%%\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n#%%\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n#%%\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n#%%\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)", "original_comment": "# ## Prepare a train/test set for Validating Stacking\n", "target_code": "from sklearn.model_selection import train_test_split\n\ntrain_ = train.fillna(-999)\ntest_ = test.fillna(-999)\ntraining, testing = train_test_split(train_, test_size=0.2, random_state=42)\n", "project_metadata": {"full_name": "liujiashen9307/KaggleCompetition", "description": "Code hub for the kaggle competitions I have participated in.", "topics": [], "git_url": "git://github.com/liujiashen9307/KaggleCompetition.git", "stars": 6, "watchers": 6, "forks": 10, "created": "2016-10-12T21:10:54Z", "size": 15258, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16811198, "HTML": 14162298, "Python": 1658600, "R": 8306}, "last_updated": "2020-02-01T03:33:11Z"}, 
"annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "cols_to_drop = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\ntrain = train.drop(cols_to_drop, axis=1)\ntest = test.drop(cols_to_drop\n", "model": "no-comments", "intent": "# Prepare a train/test set for Validating Stacking"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n#%%\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing", "original_comment": "# Let's first look at the data frame.\n", "target_code": "rating.head()\n", "project_metadata": {"full_name": "liyenhsu/restaurant-data-with-consumer-ratings", "description": "Build recommender systems for restaurants", "topics": [], "git_url": "git://github.com/liyenhsu/restaurant-data-with-consumer-ratings.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2017-11-09T05:11:58Z", "size": 1373, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1230183}, "last_updated": "2020-10-11T20:40:42Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "ratings = pd.read_csv('data/ratings_final.csv')\n", "model": "docstring", "intent": "# look at the data frame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and 
lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. Calculate Reduced Excess Rainfall:\n\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---\n\n\n\nif lid.shape[0] > 0:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. 
Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n#%%\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n#%%\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n#%%\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n#%%\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n#%%\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n#%%\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n#%%\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n#%%\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. Calculate Reduced Excess Rainfall:\n\n#%%\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---", "original_comment": "# ## C. 
Calculate the Lateral Inflow Hydrographs:\n", "target_code": " ReducedTable = calc_lateral_inflow_hydro(\n lid, ReducedTable, StormwaterTable, durations, BCN, display_print)\n", "project_metadata": {"full_name": "Dewberry/pfra-hydromet", "description": "Tools for developing pluvial (excess rainfall) and fluvial scenarios for probabilistic flood risk analyses", "topics": ["hydrology", "papermill", "montecarlo-simulation"], "git_url": "git://github.com/Dewberry/pfra-hydromet.git", "stars": 11, "watchers": 11, "forks": 12, "created": "2019-04-18T13:04:55Z", "size": 165396, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 59869977, "Python": 186157}, "last_updated": "2020-10-27T14:37:20Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "ReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n ReducedTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n", "model": "docstring", "intent": " # C. Calculate the Lateral Inflow Hydrographs:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala that spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.
  \n#
  2. pull_files.csv contains the files that were modified by each pull request.
  \n#
\n\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.
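(Editor-added sketch.) The notebook's own cell below applies this conversion to the pulls dates. For timestamps that carry a local-time offset, the same pandas call normalizes them to UTC; the commits DataFrame and its column name here are hypothetical and serve only to illustrate the behavior described above.

import pandas as pd

# Hypothetical offset-aware timestamp strings; utc=True converts them all to UTC
commits = pd.DataFrame({'date': ['2018-01-16 23:29:16+01:00',
                                 '2018-01-15 23:13:36-05:00']})
commits['date'] = pd.to_datetime(commits['date'], utc=True)
print(commits['date'])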

\n\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.

\n\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.
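(Editor-added sketch.) The cell below builds the month key with strftime('%m%Y'), so the grouped index sorts as a string rather than in calendar order. If a chronological x-axis is wanted, one variant, reusing the same pulls DataFrame, is:

# Year-month strings sort chronologically, so the plot follows calendar order
counts = pulls.groupby(pulls['date'].dt.strftime('%Y-%m'))['pid'].count()
counts.plot()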

\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier of entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.

\n\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead. Contributing there might not have the most impact. Therefore it is important to understand the parts of the system that have been recently changed. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening. Focusing on those parts might not be the most effective use of our time.

\n\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance. We might find ourselves needing some information regarding the codebase. It is important to direct any questions to the right person. Contributors to open source projects generally have other day jobs, so their time is limited. It is important to address our questions to the right people. One way to identify the right target for our inquiries is by using their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.

\n\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.

\n#

Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.

\n#

We're going to read in, clean up, and visualize the real-world project repository of Scala that spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.

\n#

The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:

\n#
    \n#
  1. pulls.csv contains the basic information about the pull requests.
  \n#
  2. pull_files.csv contains the files that were modified by each pull request.
  \n#
\n\n#%%\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.

\n\n#%%\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.

\n\n#%%\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.

\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier of entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.

\n\n#%%\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead. Contributing there might not have the most impact. Therefore it is important to understand the parts of the system that have been recently changed. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening. Focusing on those parts might not be the most effective use of our time.

\n\n#%%\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance. We might find ourselves needing some information regarding the codebase. It is important to direct any questions to the right person. Contributors to open source projects generally have other day jobs, so their time is limited. It is important to address our questions to the right people. One way to identify the right target for our inquiries is by using their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n#%%\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.

\n\n#%%\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]", "original_comment": "# Merge the obtained results with the pulls DataFrame\n", "target_code": "joined_pr = file_pr.merge(pulls, on='pid')\n", "project_metadata": {"full_name": "ChristianNogueira/datacamp_projects", "description": "DataCamp Projects", "topics": ["datacamp"], "git_url": "git://github.com/ChristianNogueira/datacamp_projects.git", "stars": 17, "watchers": 17, "forks": 13, "created": "2018-01-17T16:58:27Z", "size": 8441, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12129948}, "last_updated": "2020-08-21T20:03:31Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "joined_pr = pd.merge(file_pr, pulls, on='pid')\n# Find the users of the last 10 most recent pull requests\nusers_last_10 = set(joined_pr.nlargest(10, 'date')['user'])\n# Printing the results\nusers_last_10\n", "model": "natural", "intent": "# Merge the obtained results with the pulls DataFrame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. 
So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploratory Data Analysis\n# ![EDA_open](../images/eda_open.jpg)\n\n# # Program so far\n# ***\n# * Python Data Science Tool box\n# * Introduction to machine learning\n# * Summarizing the Data\n# * Art of Statistical Inference\n# * Linear Regression\n\n# # What are we going to learn today?\n# ***\n# - Initial Exploration\n# - Introduction to Seaborn\n# - Univariate Analysis\n# - Multi-variate Analysis\n# - Scaling, Centering, Skewness\n# - Basic data cleaning and Preprocessing\n# - Feature extraction and Feature engineering\n\n# Until now all the data that we have seen so far has been clean or pre-cleaned. In real-life we rarely get such clean datasets.\n\n# ## John's Concerns\n# ***\n# After dealing with outliers, John realised the significance of a clean data set. So he decided to learn more about data cleaning and data manipulation.\n#\n# He used the data he had collected so far.\n#\n# Let's see how John proceeds!\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn import preprocessing\nfrom sklearn.impute import SimpleImputer\nfrom scipy.stats import norm, skew\nfrom scipy import stats\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n# For some Statistics\n\n#%%", "original_comment": "# Let's import the set\n", "target_code": "import pandas as pd\n\ndf = pd.read_csv('../data/train.csv', index_col=0)\n", "project_metadata": {"full_name": "commit-live-students/GLabs_DSMX", "description": null, "topics": [], "git_url": "git://github.com/commit-live-students/GLabs_DSMX.git", "stars": 6, "watchers": 6, "forks": 23, "created": "2020-03-27T12:43:39Z", "size": 19480, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12966885}, "last_updated": "2020-12-24T07:12:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "train = pd.read_csv('train.csv')\ntest = pd.read_csv('test.csv')\n", "model": "no-comments", "intent": "# Let's import the set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack 
Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. 
Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. 
KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. For CAM6 the autoconversion rate exponent and prefactor have been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), and Feingold et al. (1988), with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al. (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We then find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimum is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies are calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies are limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n 
ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML model\n# * Hyperparameter choices and 
justification\n#\n# A baseline model for solving this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each with relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorical cross-entropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
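#
# The next cell defines the hyperparameters that are passed to the DenseNeuralNetwork
# wrapper from the mlmicrophysics package. As a rough, hedged illustration of the
# in-series design described above (not the code actually used in this notebook),
# a minimal Keras sketch of one classifier head and one regressor head might look
# like the following; layer sizes and losses mirror the description above, and the
# function names build_sketch_classifier / build_sketch_regressor are hypothetical.

# Minimal sketch only; duplicates the tensorflow.keras imports from the top of the
# notebook so the snippet is self-contained. The real models are built by
# mlmicrophysics.models.DenseNeuralNetwork.
from tensorflow import keras
from tensorflow.keras import layers


def build_sketch_classifier(n_inputs, n_classes):
    # 4 hidden layers of 30 relu neurons, softmax output over the class labels
    inputs = keras.Input(shape=(n_inputs,))
    x = inputs
    for _ in range(4):
        x = layers.Dense(30, activation="relu")(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(1e-4),
                  loss="categorical_crossentropy")
    return model


def build_sketch_regressor(n_inputs):
    # Same hidden stack, single linear output trained with mean squared error
    inputs = keras.Input(shape=(n_inputs,))
    x = inputs
    for _ in range(4):
        x = layers.Dense(30, activation="relu")(x)
    outputs = layers.Dense(1, activation="linear")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(1e-4), loss="mse")
    return model

# For example, a classifier head for the 5 input features and 3 tendency classes
# would be build_sketch_classifier(5, 3).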
\n#\n\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} 
output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness. Science, 245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A., Field, P. R., & Elsaesser, G. S. (2019). Strong Dependence of Atmospheric Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in HadGEM3. Journal of Advances in Modeling Earth Systems, 11(6), 1735\u20131758. doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Schanen, D. P. (2013). Higher-order turbulence closure and its impact on Climate Simulation in the Community Atmosphere Model. Journal of Climate, 26(23), 9655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K., Edwards, J., . . . Strand, W. G. (2020). The Community Earth System Model Version 2 (CESM2). Journal of Advances in Modeling Earth Systems, 12(2), e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M. (2014, September). On the Representation of High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model. Monthly Weather Review, 142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A. (2015, November). Putting the clouds back in aerosol\u2013cloud interactions. Atmos. Chem. Phys., 15(21), 12397\u201312411. doi: 10.5194/acp-15-12397-2015\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulating Observations of Southern Ocean Clouds and Implications for Climate. J. Adv. Model. Earth Syst. doi: 10.1029/2020JD032619\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G., Danabasoglu, G., . . . Mills, M. J. (2019). High Climate Sensitivity in the Community Earth System Model Version 2 (CESM2). Geophysical Research Letters, 46(14), 8329\u20138337. doi: 10.1029/2019GL083978\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes. J. Climate, 28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015). Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global Model Solutions and Aerosol\u2013Cloud Interactions. J. Climate, 28(3), 1288\u20131307. doi: 10.1175/JCLI-D-14-00103.1\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud Feedback. Curr. Clim. Change Rep., 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R. (2002). A PDF-Based Model for Boundary Layer Clouds. Part II: Model Results. J. Atmos. Sci., 59, 3552\u20133571.\n#\n# Hoose, C., Kristj\u00e1nsson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust, Soot, and Biological Particles in a Global Climate Model. J. Atmos. Sci., 67(8), 2483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an improved longwave radiation model, RRTM, on the energy budget and thermodynamic properties of the NCAR community climate model, CCM3. J. Geophys. Res., 105(D11), 14,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in a large-eddy simulation model of marine stratocumulus. Monthly Weather Review, 128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R. (2002, December). Small-Scale and Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Functions. J. Atmos. Sci., 59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models. Meteor. Z., 14, 499\u2013514.\n#\n# Michibata, T., & Takemura, T. (2015, September). Evaluation of autoconversion schemes in a single model framework with satellite observations. J. Geophys. Res. Atmos., 120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M. (2008). The Impact of Convection on ENSO: From a Delayed Oscillator to a Series of Events. J. Climate, 21, 5904-+. doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997). Microphysics of Clouds and Precipitation (Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulating autoconversion, accretion and selfcollection. Atmos. Res., 59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on cirrus clouds and comparison between different ice nucleation parameterizations with the Community Atmosphere Model (CAM5). Atmospheric Chemistry and Physics, 15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds. J. Atmos. Sci., 34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B. (2014, October). Different contact angle distributions for heterogeneous ice nucleation in the Community Atmospheric Model version 5. Atmos. Chem. Phys., 14(19), 10411\u201310430. doi: 10.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the parameterization of cumulus convection in the Canadian Climate Center general circulation model. Atmos. Ocean, 33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n                                     meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n                                                \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n    \"\"\"\n    This function loads data from a list of files without applying any scaling or transformation.\n    Args:\n        files: List of files being loaded\n        input_cols: List of input columns for training the neural networks\n        output_cols: List of output columns\n        train: Whether to fit the Scaler objects (unused here; kept for interface compatibility)\n        subsample: Fraction of rows randomly sampled from each file (values < 1 trigger subsampling)\n        meta_cols: Metadata columns carried along with the inputs and outputs\n    Returns:\n        Combined input, output, and metadata DataFrames\n    \"\"\"\n    all_input_data = []\n    all_output_data = []\n    all_meta_data = []\n    for i, filename in enumerate(files):\n        if i % 10 == 0:\n            print(\n                f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. 
Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n#%%\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n#%%\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n#%%\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n#%%\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n#%%\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n#%%\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n#%%\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n 
ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n#%%\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n#%%\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML 
model\n# * Hyperparameter choices and justification\n#\n# A baseline model for solving this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each with relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorical cross-entropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
\n#\n\n#%%\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n#%%\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n#%%\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n#%%\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n#%%\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n#%%\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed 
{output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n#%%\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n#%%\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n#%%\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n#%%\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n#%%\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n#%%\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n#%%", "original_comment": "# Histograms of test input data by column\n", "target_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_test[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < 
len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_test[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n", "model": "natural", "intent": "# Histograms of test input data by column"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n#%%\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n#%%\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = 
df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n#%%\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()", "original_comment": "# #### Check Missing Value\n", "target_code": "for col in df_train.columns:\n if any(df_train[col].isnull()):\n print(\"feature %s, missing %i entries\" %\n (col, sum(df_train[col].isnull())))\n else:\n print(\"feature %s has no missing value\" % col)\n", "project_metadata": {"full_name": "shawlu95/Data-Science-Toolbox", "description": "Examples and illustration of basic statistic concepts, probability distribution, Monte Carlo simulation, preprocessing and visualization techniques, and statistical testing.", "topics": [], "git_url": "git://github.com/shawlu95/Data-Science-Toolbox.git", "stars": 28, "watchers": 28, "forks": 11, "created": "2019-03-25T19:58:55Z", "size": 157445, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 52401937, "Python": 36992, "TSQL": 3834, "PLpgSQL": 3609, "Shell": 3459, "R": 1437}, "last_updated": "2020-12-26T18:51:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "coverage": "Strongly agree", "coverage-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_pred.isnull().sum()\n", "model": "natural", "intent": "# Check Missing Value"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n\nprint(spam.head())\n\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n\nprint(data_set[:5])\n\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n 
else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n\nprint(training_set[:5])\n\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n#%%\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n#%%\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n#%%\n\nprint(spam.head())\n\n#%%\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n#%%\n\nprint(data_set[:5])\n\n#%%\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n#%%\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n#%%\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into 
words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n#%%\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n#%%\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n#%%\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n#%%\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n#%%\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n#%%\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n#%%\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n#%%\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n#%%\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n#%%\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n#%%\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n#%%\n\nprint(training_set[:5])\n\n#%%\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n#%%", "original_comment": "# Training the classifier with NaiveBayes algorithm\n", "target_code": "spamClassifier = nltk.NaiveBayesClassifier.train(training_set)\n", "project_metadata": {"full_name": "beingdatum/NaturalLanguageProcessing", "description": null, "topics": [], "git_url": "git://github.com/beingdatum/NaturalLanguageProcessing.git", "stars": 3, "watchers": 3, "forks": 10, "created": "2020-01-01T13:54:22Z", "size": 23376, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2267856, "Python": 1378}, "last_updated": "2020-06-08T09:54:47Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, 
"usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", "model": "docstring", "intent": "# Training the classifier with NaiveBayes algorithm"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n#%%\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n#%%\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n#%%\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n#%%\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. 
Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n#%%\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)", "original_comment": "# print git status for the local repository\n", "target_code": "status_command = \"git status\"\noutput = subprocess.check_output(status_command, shell=True)\nprint(output)\n", "project_metadata": {"full_name": "uzh/helmchen-spark", "description": "Playbooks and other files to build a (virtual) Spark cluster for Prof. Helmchen's research group", "topics": [], "git_url": "git://github.com/uzh/helmchen-spark.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2016-03-23T21:54:52Z", "size": 6519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2819538, "Python": 37375, "Shell": 3482}, "last_updated": "2019-12-15T16:09:17Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "repo_git_commit = subprocess.check_output(\n [\"git\", \"-C\", repo_path, \"commit\"])\nprint(repo_git_commit)\n", "model": "docstring", "intent": "# print git status for the local repository"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World 
population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. 
This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
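
# (Aside) A minimal sketch, on synthetic data, that ties together the styling, log-scale, and savefig ideas
# shown above; the random-walk "prices" series and the output file name are invented for illustration only.


# Synthetic "price" series: a random walk indexed by business days
rng = np.random.default_rng(42)
prices = pd.Series(100 + rng.standard_normal(250).cumsum(),
                   index=pd.date_range('2015-01-01', periods=250, freq='B'),
                   name='close')

prices.plot(color='b', style='.-', legend=True)  # pandas forwards the style arguments to matplotlib
plt.yscale('log')                                # fix the scale exactly as with a plain matplotlib plot
plt.title('Synthetic closing prices')
plt.savefig('synthetic_prices.png')              # .png / .jpg / .pdf all work, as shown above
plt.show()

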
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
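
# (Aside) Before the stock-price exercise, a compact sketch of the histogram options listed earlier
# (bins, range, density, cumulative) on a synthetic sample; the normal-distribution parameters are made up.
# The same pattern is applied to the real tips data later in this chapter.


rng = np.random.default_rng(0)
sample = pd.DataFrame({'length': rng.normal(loc=6.0, scale=0.8, size=500)})

fig, axes = plt.subplots(nrows=2, ncols=1)

# PDF-style histogram: density=True normalizes the bar areas so they integrate to one
sample.plot(y='length', kind='hist', bins=30, range=(4, 8), density=True, ax=axes[0])

# CDF: the same call with cumulative=True added on top of density=True
sample.plot(y='length', kind='hist', bins=30, range=(4, 8), density=True,
            cumulative=True, ax=axes[1])
plt.show()

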
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cd\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative density functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entires\n# * mean: average of entries\n# * std: standard deviation\n# * min: miniumum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n\niris['sepal length (cm)'].count() # Applied to Series\n\n\niris['sepal width (cm)'].count() # Applied to Series\n\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency to a central value of a measurement\n\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical value that splits the data into two sets\n# * one with the 
fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n\niris.min()\n\n\niris.max()\n\n\n# #### Box Plots\n\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf.fare.describe()\n\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
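
# (Aside) A self-contained illustration of that effect with made-up numbers, before working with the real
# January and March data below: two samples can share a mean while their standard deviations differ widely.


rng = np.random.default_rng(1)
calm = pd.Series(rng.normal(loc=30, scale=3, size=31))    # small day-to-day swings
swingy = pd.Series(rng.normal(loc=30, scale=12, size=31))  # large day-to-day swings

print(calm.mean(), swingy.mean())  # similar central tendency
print(calm.std(), swingy.std())    # very different spread

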
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n\n# Print the mean of the January and March data\njanuary.mean()\n\n\nmarch.mean()\n\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n\nsetosa['species'].unique()\n\n\nversicolor['species'].unique()\n\n\nvirginica['species'].unique()\n\n\nsetosa.head(2)\n\n\nversicolor.head(2)\n\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All Data\n\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n 
alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n\ndescribe_all = iris.describe()\ndescribe_all\n\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n\nsales.reindex(evening_2_11, method='ffill')\n\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n\ndf1.head()\n\n\ndf2.head()\n\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
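
# (Aside) A minimal sketch, with made-up values, of why df3 above supports date-based selection while df1
# does not: the date strings have to become a DatetimeIndex first. The column name and values are invented.


# Tiny made-up log: dates arrive as plain strings
raw = pd.DataFrame({'Date': ['2015-02-01 09:00', '2015-02-01 17:30', '2015-03-02 08:15'],
                    'Units': [3, 5, 2]})

# With a plain RangeIndex, raw.loc['2015-02'] would raise a KeyError.
# Converting the column and moving it into the index enables partial-string selection:
ts = raw.assign(Date=pd.to_datetime(raw['Date'])).set_index('Date')
print(ts.loc['2015-02'])                   # whole month
print(ts.loc['2015-02-01':'2015-02-28'])   # date-string slicing

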
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
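
# (Aside) A tiny sketch of that matching step with made-up weekday data; the slots for which no old data
# exists are exactly the case the next sentence describes.


# Made-up weekday series reindexed onto a full calendar week
weekdays = pd.Series([1, 2, 3, 4, 5],
                     index=pd.date_range('2016-07-04', periods=5, freq='B'))
full_week = pd.date_range('2016-07-04', periods=7, freq='D')

print(weekdays.reindex(full_week))                  # weekend slots become NaN
print(weekdays.reindex(full_week, method='ffill'))  # weekend slots carry Friday's value forward

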
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n\ndaily_mean.loc['2015-2-2']\n\n\nsales.loc['2015-2-2', 'Units']\n\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n\nsales.resample('D').sum().head()\n\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter |\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n\nsales.loc[:, 
'Units'].resample('2W').sum()


# #### Upsampling


two_days = sales.loc['2015-2-4':'2015-2-5', 'Units']
two_days


# #### Upsampling and filling


two_days.resample('4H').ffill()


# ### Exercises

# #### Resampling and frequency
#
# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.
#
# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.
#
# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.
#
# ***Instructions***
#
# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.
# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result to ***df2***.


df = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',
                 parse_dates=True,
                 index_col='Date')
df.head()


# Downsample to 6 hour data and aggregate by mean: df1
df1 = df.Temperature.resample('6H').mean()
df1.head()


# Downsample to daily data and count the number of data points: df2
df2 = df.Temperature.resample('D').count()
df2.head()


# #### Separating and resampling
#
# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.
#
# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.
#
# ***Instructions***
#
# * Use partial string indexing to extract temperature data for August 2010 into ***august***.
# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.
# * Use partial string indexing to extract temperature data for February 2010 into ***february***.
# * Use the temperature data for February and downsample to find the daily minimum temperatures. Store the result in ***february_lows***.


# Extract temperature data for August: august
august = df.loc['2010-08', 'Temperature']
august.head()


# Downsample to obtain only the daily highest temperatures in August: august_highs
august_highs = august.resample('D').max()
august_highs.head()


# Extract temperature data for February: february
february = df.loc['2010-02', 'Temperature']
february.head()


# Downsample to obtain the daily lowest temperatures in February: february_lows
february_lows = february.resample('D').min()
february_lows.head()


# #### Rolling mean and frequency
#
# In this exercise, some hourly weather data is pre-loaded for you. 
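
# (Aside) Before moving on to rolling windows, a minimal down- versus up-sampling contrast on a made-up
# hourly series; the values and frequencies below are invented, but the calls match the ones used above.


# A made-up hourly series spanning two days
hourly = pd.Series(np.arange(48.0),
                   index=pd.date_range('2010-08-01', periods=48, freq='H'))

print(hourly.resample('6H').mean().head())    # downsample: 48 hourly points -> eight 6-hour means
print(hourly.resample('30T').ffill().head())  # upsample: new half-hour slots are forward-filled

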
You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. 
Assign the result to ***daily_highs_smoothed*** and print the result.\n\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
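# Note on the interpolation call above: the documented keyword for
# ```Series.interpolate()``` is ```method```, not ```how```; depending on your pandas version
# the extra ```how='linear'``` argument may be silently ignored or rejected. The same
# fill using the documented keyword:


ts2_interp = ts2.reindex(ts1.index).interpolate(method='linear')
ts2_interp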
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes not only in your data, but also in your plotting.\n#\n# In this 
exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
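# Aside: the ```read_csv()``` keyword options listed in the reminder above can be
# combined in a single call. A self-contained sketch on an inline CSV -- the toy
# data and column names here are purely illustrative, not the NOAA file:


from io import StringIO

toy_csv = StringIO('2011-01-01,61,M\n2011-01-02,44,38\n2011-01-03,-9999,35\n')

toy = pd.read_csv(toy_csv,
                  header=None,                              # file has no header row
                  names=['date', 'temp_f', 'dew_point_f'],  # assign column labels
                  index_col='date',                         # use the date column as the index
                  parse_dates=['date'],                     # parse it as datetimes
                  na_values=['M', '-9999'])                 # treat these tokens as missing
toy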
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
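# Aside on the zero-padding step above: the same result can be obtained with the
# vectorized string accessor instead of ```.apply()``` (assumes ```df_dropped``` from the
# previous cells; re-running it on already-padded values is harmless):


df_dropped['Time'] = df_dropped['Time'].astype(str).str.zfill(4)
df_dropped['Time'].head()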
Again, specify ***errors='coerce'***.\n#\n\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. 
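# Aside: why the misaligned indexes matter -- subtracting two Series whose
# DatetimeIndexes do not overlap aligns on the index union and yields only NaNs.
# A tiny sketch with made-up values:


a = pd.Series([70, 71], index=pd.to_datetime(['2011-01-01', '2011-01-02']))
b = pd.Series([60, 61], index=pd.to_datetime(['2010-01-01', '2010-01-02']))
a - b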
This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\ndf_clean.head(3)\n\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. 
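# Note on the ```.resample('D').mean()``` call in the signal-variance exercise above:
# in newer pandas versions, taking the mean of a resampled DataFrame that still
# contains string columns (such as ```sky_condition```) can raise a TypeError. One
# workaround, assuming ```df_clean``` from above, is to restrict the aggregation to
# numeric columns:


daily_mean_2011 = df_clean.resample('D').mean(numeric_only=True)
daily_mean_2011.head()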
Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. 
You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
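# Aside: rather than inferring the number of missing entries from ```.info()``` by
# hand, you can compute it directly. A quick sketch using the same ```wb_df```:

#%%

# Non-null counts per column, then the number of missing entries per column
print(wb_df.count())
print(wb_df.isnull().sum())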
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
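# Note on the sunspots read above: recent pandas versions deprecate the
# nested-list form ```parse_dates=[[0, 1, 2]]``` for combining several columns into a
# single datetime. An alternative sketch that builds the index explicitly after
# reading (assumes the same ```filepath``` and ```col_names``` as above):

#%%

sunspots_alt = pd.read_csv(filepath, sep=';', header=None, names=col_names,
                           na_values={'tot_sunspots': [' -1'],
                                      'daily_std': ['-1']})
sunspots_alt.index = pd.to_datetime(sunspots_alt[['year', 'month', 'day']])
sunspots_alt.index.name = 'date'
sunspots_alt.iloc[10:20, :]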
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
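# Aside on the vertical-scaling problem above: another option is to keep all
# three columns on one figure but give the pressure data its own axis via the
# ```secondary_y``` plotting option (assumes the pressure column is named
# ```'Pressure'``` in this dataset):

#%%

df.plot(secondary_y='Pressure')
plt.show()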
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
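# Aside on the 'Word of Warning' above: a quick side-by-side of the three
# histogram idioms (in current pandas the accessor form is spelled
# ```iris.plot.hist()```). Each draws a histogram, but the return types and figure
# layouts differ:

#%%

iris.plot(y='sepal length (cm)', kind='hist')   # .plot() with kind=, returns a single Axes
plt.show()

#%%

iris.plot.hist(y='sepal length (cm)')           # plot accessor form
plt.show()

#%%

iris.hist(column='sepal length (cm)')           # DataFrame.hist(), returns an array of Axes
plt.show()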
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n#%%\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cdf\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative distribution functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```density=True``` (formerly ```normed=True```) in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```density=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entries\n# * mean: average of entries\n# * std: standard deviation\n# * min: minimum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n#%%\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n#%%\n\niris['sepal length (cm)'].count() # Applied to Series\n\n#%%\n\niris['sepal width (cm)'].count() # Applied to Series\n\n#%%\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n#%%\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the central tendency of a measurement\n\n#%%\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n#%%\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n#%%\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n#%%\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n#%%\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical 
value that splits the data into two sets\n# * one with the fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n#%%\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n#%%\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n#%%\n\niris.min()\n\n#%%\n\niris.max()\n\n\n# #### Box Plots\n\n#%%\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n#%%\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n#%%\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n#%%\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n#%%\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf.fare.describe()\n\n#%%\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n#%%\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n#%%\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n#%%\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n#%%\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n#%%\n\n# Print the mean of the January and March data\njanuary.mean()\n\n#%%\n\nmarch.mean()\n\n#%%\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n#%%\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n#%%\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n#%%\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n#%%\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n#%%\n\nsetosa['species'].unique()\n\n#%%\n\nversicolor['species'].unique()\n\n#%%\n\nvirginica['species'].unique()\n\n#%%\n\nsetosa.head(2)\n\n#%%\n\nversicolor.head(2)\n\n#%%\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All 
Data\n\n#%%\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n#%%\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n#%%\n\ndescribe_all = iris.describe()\ndescribe_all\n\n#%%\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n#%%\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n#%%\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n#%%\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n#%%\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n#%%\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n#%%\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n#%%\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n#%%\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n#%%\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n#%%\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n#%%\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n#%%\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n#%%\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n#%%\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n#%%\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n#%%\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n#%%\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n#%%\n\nsales.reindex(evening_2_11, method='ffill')\n\n#%%\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf2.head()\n\n#%%\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n#%%\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
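#
# As a quick aside, here is a minimal sketch of the format-string parsing described
# above, reusing the example strings and format from the text (these values are
# illustrative only and are not the exercise's pre-loaded data):

#%%

# Parse two timestamp strings with an explicit format specification
example_index = pd.to_datetime(['2015-01-01 091234', '2015-01-01 091234'],
                               format='%Y-%m-%d %H%M%S')
example_index  # DatetimeIndex(['2015-01-01 09:12:34', '2015-01-01 09:12:34'], ...)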
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n#%%\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n#%%\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n#%%\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n#%%\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n#%%\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n#%%\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n#%%\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n#%%\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n#%%\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n#%%\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
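#
# For instance, here is a minimal toy sketch of reindexing onto a longer
# DatetimeIndex (the Series below is illustrative and is not the exercise data):

#%%

toy_series = pd.Series([1.0, 2.0, 3.0],
                       index=pd.to_datetime(['2016-07-01', '2016-07-02', '2016-07-03']))

# Reindex onto five consecutive days; the two days without data become NaN
toy_series.reindex(pd.date_range('2016-07-01', periods=5, freq='D'))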
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n#%%\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n#%%\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n#%%\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n#%%\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n#%%\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n#%%\n\ndaily_mean.loc['2015-2-2']\n\n#%%\n\nsales.loc['2015-2-2', 'Units']\n\n#%%\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n#%%\n\nsales.resample('D').sum().head()\n\n#%%\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n#%%\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n#%%\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter 
|\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n#%%\n\nsales.loc[:, 'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n#%%\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n#%%\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result ***df2***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n#%%\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. 
Store the result in ***february_lows***.\n\n#%%\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n#%%\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n#%%\n\n# Extract temperature data for February: february\nfebruary = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n#%%\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# #### Rolling mean and frequency\n#\n# In this exercise, some hourly weather data is pre-loaded for you. You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n#%%\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n#%%\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n#%%\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n#%%\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. 
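#
# As a quick illustration of chaining .resample() and .rolling() together, here is a
# sketch with a small synthetic hourly Series (illustrative only, not the exercise data):

#%%

# Ten days of synthetic hourly values: daily maxima smoothed by a 3-day rolling mean
hourly = pd.Series(np.arange(240, dtype=float),
                   index=pd.date_range('2016-07-01', periods=240, freq='H'))
hourly.resample('D').max().rolling(window=3).mean()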
Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. Assign the result to ***daily_highs_smoothed*** and print the result.\n\n#%%\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n#%%\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n#%%\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n#%%\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n#%%\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n#%%\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n#%%\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n#%%\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n#%%\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n#%%\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n#%%\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n#%%\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n#%%\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n#%%\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n#%%\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n#%%\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n#%%\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n#%%\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n#%%\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n#%%\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
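#
# With a DatetimeIndex, .tz_localize() and .tz_convert() are called on the index itself;
# here is a minimal sketch with a toy Series (illustrative only, not the flight data):

#%%

toy_tz = pd.Series([1, 2],
                   index=pd.to_datetime(['2015-07-01 09:00', '2015-07-01 10:00']))

# Localize the naive index to US/Central, then convert it to US/Pacific
toy_tz.index = toy_tz.index.tz_localize('US/Central').tz_convert('US/Pacific')
toy_tz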
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n#%%\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n#%%\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n#%%\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n#%%\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n#%%\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n#%%\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n#%%\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n#%%\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n#%%\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n#%%\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n#%%\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes 
not only in your data, but also in your plotting.\n#\n# In this exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n#%%\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n#%%\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n#%%\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n#%%\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n#%%\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n#%%\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n#%%\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n#%%\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n#%%\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n#%%\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n#%%\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n#%%\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n#%%\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n#%%\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n#%%\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n#%%\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n#%%\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n#%%\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n#%%\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n#%%\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. 
You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n#%%\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n#%%\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n#%%\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n#%%\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n#%%\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. 
That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\ndf_clean.head(3)\n\n#%%\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n#%%\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n#%%\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n#%%\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n#%%\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n#%%\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n#%%", "original_comment": "# Calculate the mean of overcast_daily_max\n", "target_code": "overcast_daily_max_mean = overcast_daily_max.mean()\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "overcast_daily_max_mean = overcast_daily_max.mean()\novercast_daily_max_mean\n", "model": "natural", "intent": "# Calculate the mean of overcast_daily_max"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom 
pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! >\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n\n# - plot photo-z stacked pdf vs true-z 
histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nfrom pzblend import PhotozBlend\n\n\n# ### Load catalogs\n\n#%%\n\n# - necessary file paths\ncoadd_filename = 'data/coadd_data_tract_3830_cutout.parquet'\ntruth_filename = 'data/truth_data_hpix_9685_cutout.parquet'\nzgrid_filename = 'data/zgrid.npy'\n\n# - read in the truth and coadd catalogs (make sure you have pyarrow installed)\ntruth_df = pd.read_parquet(truth_filename, engine='pyarrow')\ncoadd_df = pd.read_parquet(coadd_filename, engine='pyarrow')\n\n# - now read in photoz pdf bin centers\nzgrid = np.load(zgrid_filename)\n\n#%%\n\n# view the coadd dataframe (DC2_run2.2i including photoz data -- all objects)\ncoadd_df\n\n#%%\n\n# view the truth dataframe (CosmoDC2.v.1.1.4 -- all galaxies)\ntruth_df\n\n\n# ### Define cuts\n\n#%%\n\n# Let's define our quality cuts\n\n# - quality cuts on the truth dataframe\ntruth_cuts = [\n 'mag_i_lsst.notna()', # remove nan magnitudes\n 'mag_i_lsst < 27', # apply a magnitude cut\n]\n\n# - quality cuts on the coadd dataframe\nbasic_cuts = [\n 'extendedness > 0', # select the extended objects\n 'mag_i.notna()', # select objects that have i-band magnitudes\n # 'clean', # the source has no flagged pixels (interpolated, saturated, edge, clipped...)\n # and was not skipped by the deblender # (good && ~deblend_skipped) < already applied! 
>\n 'xy_flag == 0' # bad centroiding\n]\n\nlensing_cuts = [\n # (from this and below) remove nan entries\n 'i_modelfit_CModel_instFlux.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_resolution.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e1.notna()',\n 'ext_shapeHSM_HsmShapeRegauss_e2.notna()',\n 'snr_i_cModel >= 10',\n # (from this and below) basic flag cuts\n 'detect_isPrimary',\n 'deblend_skipped == False',\n 'base_PixelFlags_flag_edge == False',\n 'base_PixelFlags_flag_interpolatedCenter == False',\n 'base_PixelFlags_flag_saturatedCenter == False',\n 'base_PixelFlags_flag_crCenter == False',\n 'base_PixelFlags_flag_bad == False',\n 'base_PixelFlags_flag_suspectCenter == False',\n 'base_PixelFlags_flag_clipped == False',\n 'ext_shapeHSM_HsmShapeRegauss_flag == False',\n 'ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3',\n 'shape_hsm_regauss_etot < 2',\n 'ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4',\n 'mag_i_cModel < 24', # FIXME: Doesn't have extinction correction?\n 'base_Blendedness_abs < 10**(-0.375)'\n]\n\n\n# ### Feed data to the class\n\n#%%\n\n# - create an instance of the PhotozBlend class\npzb = PhotozBlend(truth_df, coadd_df, zgrid)\n\n#%%\n\n# - apply the cuts we defined earlier\npzb.apply_truth_cuts(truth_cuts)\npzb.apply_coadd_cuts(basic_cuts+lensing_cuts)\n\n# - do the friends of friends matching\npzb.fof_match(verify=True, plot=True, save_cached=True) # load_cached=True\n\n#%%\n\n# - let's take a look at the results of fof-matching\npzb.fof_results\n\n\n# ### Plotting\n\n#%%\n\n# in all the plotting functions you can use save_plot=True to save the plots\n# force_refresh=True tries to forcefully recompute things for the new plot (not usually needed but just in case)\n# use_latest=True uses the latest parameters and settings\n# the default behaviour is to use the latest values unless you explicitly pass a keyword argument\n\n# - zz plot\npzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1,\n truth_pick='bright', colorbar='vertical')\n\n# - zz plot (with more plotting options)\n# pzb.plot_zz(pz_type='z_mode', num_truth=2, num_coadd=1, truth_pick='bright',\n# xlim=(0,3), ylim=(0,3), fig=None, figsize=None, ax=None, colorbar='vertical',\n# pad='0.9%', cmap=plt.cm.Spectral_r, annotate=1, colorbar_lim=(None,None))\n\n#%%\n\n# - plot photo-z stacked pdf vs true-z histogram\n# note that it inherited some parameters from the previous plotting command\n# (you can always update them for each plot if you want)\npzb.plot_pdf(kde_bandwidth='scott')\n\n# - with more options, e.g. 
using ML to choose a cross-validated bandwidth given a list of bandwidths to select from (takes longer!)\n# pzb.plot_pdf(kde_bandwidth=np.logspace(0.01,0.2,20), n_iter=15, n_jobs=-1)\n\n#%%\n\n# - plot the PIT histogram\npzb.plot_pit(save_plot=1)", "original_comment": "# ### Put them all together in multiplots\n", "target_code": "pzb.plot_multi(names=['fof', 'zz', 'pdf', 'pit'], verbose=1, save_plot=1)\n", "project_metadata": {"full_name": "LSSTDESC/pz_blend", "description": "impact of blending on photo-zs using DC2 truth catalogs and image catalogs", "topics": [], "git_url": "git://github.com/LSSTDESC/pz_blend.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-03-12T22:06:14Z", "size": 2183, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1709826, "Python": 89195}, "last_updated": "2020-12-09T18:50:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))\nax = axes[0, 0]\nax = axes[0, 1]\nax = axes[1, 0]\nax = axes[1, 1]\nax = axes[0, 0]\nax = axes[0, 1]\nax = axes[1, 0]\nax = axes[1, 1]\nax = axes[0, 0]\nax = axes[0, 1]\nax = axes[1, 0]\nax = axes[1, 1]\nax = axes[0, 0]\nax = axes\n", "model": "natural", "intent": "# Put them all together in multiplots"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. 
col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. (Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\nimport nltk\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n#%%\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. 
col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n#%%\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n#%%\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. 
The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n#%%\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. 
(Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''", "original_comment": " # Tokenize words in reviews:\n", "target_code": " import nltk\n\n df_with_reviews['rev_cleaned'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: nltk.word_tokenize(x))\n", "project_metadata": {"full_name": "MeredithLevsen/InsightProject", "description": "GameOn - Quickly evaluate board games based on user reviews", "topics": [], "git_url": "git://github.com/MeredithLevsen/InsightProject.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2018-07-17T17:31:15Z", "size": 541, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1045270, "HTML": 265408}, "last_updated": "2018-12-04T03:47:10Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "tokenizer = RegexpTokenizer(r'\\w+')\n", "model": "docstring", "intent": " # Tokenize words in reviews:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n#%%\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n#%%\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n#%%\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)", "original_comment": " # extract dimensions\n", "target_code": " original_image_height, original_image_width = gray_img.shape\n", "project_metadata": {"full_name": "howarder3/ironman2020_OpenCV_photoshop", "description": null, "topics": [], "git_url": "git://github.com/howarder3/ironman2020_OpenCV_photoshop.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-12T15:55:03Z", "size": 125635, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 131231786}, 
"last_updated": "2020-12-23T03:20:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "img_width, img_height = origin_img.shape[0], origin_img.shape[1]\n", "model": "natural", "intent": " # extract dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n\nic.fit(inputs=s3_channels)\n\n\n\nendpoint_name = 'c5-'+time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n#%%\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our 
region\n\n#%%\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n#%%\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n#%%\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n#%%\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n#%%\n\nic.fit(inputs=s3_channels)", "original_comment": "# ### Deploy the model\n", "target_code": "c5_predictor = ic.deploy(initial_instance_count=1,\n instance_type='ml.c5.large',\n endpoint_name=endpoint_name,\n wait=False)\n", "project_metadata": {"full_name": "PacktPublishing/Learn-Amazon-SageMaker", "description": "Learn Amazon SageMaker", "topics": [], "git_url": "git://github.com/PacktPublishing/Learn-Amazon-SageMaker.git", "stars": 30, "watchers": 30, "forks": 20, "created": "2020-04-22T14:55:25Z", "size": 47447, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2818256, "Python": 146100, "R": 2078, "Dockerfile": 738}, "last_updated": "2020-12-29T08:53:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "ic_classifier = ic.deploy(initial_instance_count=1,\n instance_type='ml.m4.xlarge')\n", "model": "no-comments", "intent": "# Deploy the model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, 
None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n 
initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, 
activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. 
\\\\\n# arch zero:\n\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n\nout\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n\ntrain, validation = get_MNIST_data()\n\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), 
activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n 
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n#%%\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n#%%\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n#%%\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n#%%\n\nout\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n#%%\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = 
rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n#%%\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n#%%\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n#%%", "original_comment": "# Shifted dataset\n", "target_code": "train_20, validation_20 = get_MNIST_data(shift=20)\ntrain_20 = rescale(train_20)\nvalidation_20 = rescale(validation_20)\n", "project_metadata": {"full_name": "elahea2020/6.036", "description": "Homework solutions of Intro to ML course at MIT Spring 2018", "topics": ["ml", "machine-learning", "machine-learning-algorithms", "mit", "6036", "perceptron-learning-algorithm", "rnn"], "git_url": "git://github.com/elahea2020/6.036.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2018-05-08T21:21:54Z", "size": 65530, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 18939819, "Python": 168769}, "last_updated": "2020-10-25T08:09:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "\n", "model": "no-comments", "intent": "# Shifted dataset"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n\nimport matplotlib.pyplot as 
plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n\n# View one of the records\ndata.take(1)\n\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n\nlr_model = lr.fit(train)\n\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n#%%\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n#%%\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n#%%\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n#%%\n\n# View one of the records\ndata.take(1)\n\n#%%\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: 
regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n#%%\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n#%%\n\nlr_model = lr.fit(train)\n\n#%%\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n#%%\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n#%%\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n#%%\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n#%%", "original_comment": "# Area under the ROC\n", "target_code": "print(\"Area Under ROC = %.2f\" % metrics.areaUnderROC)\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "print(\"Accuracy: \" + str(metrics.accuracy))\n", "model": "no-comments", "intent": "# Area under the ROC"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from https://keras.io/applications/#usage-examples-for-image-classification-models\n\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import 
backend as K\nimport keras\n\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n#\n# Finetune the Inception V3 network on the CDiscount dataset.\n#\n# Taken from 
https://keras.io/applications/#usage-examples-for-image-classification-models\n\n#%%\n\nfrom keras.optimizers import SGD\nimport os\nimport pickle\nimport itertools\nimport io\nimport time\nimport bson\nimport threading\n\nimport pandas as pd\nfrom scipy.misc import imread\nimport numpy as np\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.applications.inception_v3 import InceptionV3\nfrom keras.preprocessing import image\nfrom keras.models import Model\nfrom keras.layers import Dense, GlobalAveragePooling2D\nfrom keras import backend as K\nimport keras\n\n#%%\n\ndef create_model(num_classes=None):\n # create the base pre-trained model\n base_model = InceptionV3(weights='imagenet', include_top=False)\n\n # add a global spatial average pooling layer\n x = base_model.output\n x = GlobalAveragePooling2D()(x)\n # let's add a fully-connected layer\n x = Dense(4096, activation='relu')(x)\n # and a logistic layer -- let's say we have 200 classes\n predictions = Dense(num_classes, activation='softmax')(x)\n\n # this is the model we will train\n model = Model(inputs=base_model.input, outputs=predictions)\n\n # first: train only the top layers (which were randomly initialized)\n # i.e. freeze all convolutional InceptionV3 layers\n for layer in base_model.layers:\n layer.trainable = False\n\n # compile the model (should be done *after* setting layers to non-trainable)\n model.compile(optimizer='rmsprop',\n loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n\n return model\n\n#%%\n\ndef grouper(n, iterable):\n '''\n Given an iterable, it'll return size n chunks per iteration.\n Handles the last chunk too.\n '''\n it = iter(iterable)\n while True:\n chunk = tuple(itertools.islice(it, n))\n if not chunk:\n return\n yield chunk\n\n\nclass threadsafe_iter:\n \"\"\"\n Takes an iterator/generator and makes it thread-safe by\n serializing call to the `next` method of given iterator/generator.\n \"\"\"\n\n def __init__(self, it):\n self.it = it\n self.lock = threading.Lock()\n\n def __iter__(self):\n return self\n\n def __next__(self):\n with self.lock:\n return self.it.__next__()\n\n\ndef threadsafe_generator(f):\n \"\"\"\n A decorator that takes a generator function and makes it thread-safe.\n \"\"\"\n def g(*a, **kw):\n return threadsafe_iter(f(*a, **kw))\n return g\n\n\n@threadsafe_generator\ndef get_features_label(documents, batch_size=32, return_labels=True):\n '''\n Given a document return X, y\n\n X is scaled to [0, 1] and consists of all images contained in document.\n y is given an integer encoding.\n '''\n\n for batch in grouper(batch_size, documents):\n images = []\n labels = []\n\n for document in batch:\n category = document.get('category_id', '')\n img = document.get('imgs')[0]\n data = io.BytesIO(img.get('picture', None))\n im = imread(data)\n\n if category:\n label = labelencoder.transform([category])\n else:\n label = None\n\n im = im.astype('float32') / 255.0\n\n images.append(im)\n labels.append(label)\n\n if return_labels:\n yield np.array(images), np.array(labels)\n else:\n yield np.array(images)\n\n#%%\n\nif os.path.isfile('labelencoder.pkl'):\n with open('labelencoder.pkl', 'rb') as f:\n labelencoder = pickle.load(f)\n categories = pd.read_csv('categories.csv')\n\nelse:\n # Get the category ID for each document in the training set.\n documents = bson.decode_file_iter(open('../input/train.bson', 'rb'))\n categories = [(d['_id'], d['category_id']) for d in documents]\n categories = pd.DataFrame(categories, columns=['id', 'cat'])\n\n # Create a label encoder for 
all the labels found\n labelencoder = LabelEncoder()\n labelencoder.fit(categories.cat.unique().ravel())\n\n with open('labelencoder.pkl', 'wb') as f:\n pickle.dump(labelencoder, f)\n\n categories.to_csv('categories.csv')\n\n#%%\n\n# load the previous model\n\ntry:\n inception = keras.models.load_model('inceptionv3-finetune.h5')\nexcept:\n inception = create_model(num_classes=len(labelencoder.classes_))", "original_comment": "# So we can look at the progress on Tensorboard\n", "target_code": "import time\n\ncallback = keras.callbacks.TensorBoard(\n log_dir='./logs/inception/2/{}'.format(time.time())\n)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "logdir = os.path.join(os.getcwd(), 'logs')\ntensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)\n", "model": "natural", "intent": "# Enable callback to be able to look at the progress on Tensorboard"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. 
These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n#%%\n\nimport seaborn as sns", "original_comment": "# This makes a white background with grid lines\n", "target_code": "import seaborn as sns\n\nsns.set_style(\"whitegrid\")\n", "project_metadata": {"full_name": "emonson/pandas-datamatters", "description": "Python for Tabular Data and Visualization \u2013 Data Matters 2020", "topics": [], "git_url": "git://github.com/emonson/pandas-datamatters.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-12-02T18:35:22Z", "size": 5862, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1794056}, "last_updated": "2021-01-05T16:21:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "df.head()\n", "model": "no-comments", "intent": "# Make a white background with grid lines"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = 
file1.read()\nfile\n\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\nsavelist # list of lists\n\n\n# we can check the items of the list we created\nsavelist[1]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n#%%\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n#%%\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n#%%\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n#%%\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n#%%\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read 
line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n#%%\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n#%%\n\nsavelist # list of lists\n\n#%%\n\n# we can check the items of the list we created\nsavelist[1]\n\n#%%", "original_comment": "# access items from a list\n", "target_code": "savelist[0][1]\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "savelist.get(1)\nsavelist.get(2)\n", "model": "docstring", "intent": "# access items from a list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n#%%\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n#%%\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n#%%\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n#%%\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n#%%\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n#%%\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n#%%\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n#%%\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n#%%\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224", "original_comment": "# Loading in the data.\n", "target_code": "data = ImageClassifierData.from_paths(\n PATH, tfms=tfms_from_model(arch, sz), bs=64)\n", "project_metadata": {"full_name": "zachmonge/fish_computer_vision_example", "description": "This is the repository corresponding to my Medium blog post titled \"Does Deep Learning Really Require 'Big Data'? 
--No!\"", "topics": [], "git_url": "git://github.com/zachmonge/fish_computer_vision_example.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-08-20T03:51:12Z", "size": 8148, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1391423}, "last_updated": "2020-02-13T19:27:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "arch = resnet34\nsz = 224\n", "model": "no-comments", "intent": "# Loading in the data."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n\ntype(raw_text)\n\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n\nraw_text[:100]\n\n\n# ## Tokenization\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Exploring the NLTK Book (Chapter 3)\n# [NLTK Book](https://www.nltk.org/book/)\n#\n# Resources:\n# * [urllib](https://docs.python.org/3/library/urllib.html)
Python package for working with urls.\n# * [Regular Expression module](https://docs.python.org/3/library/re.html)
allows us to [use regular expressions in python](https://docs.python.org/3/howto/regex.html#regex-howto) strings\n# * [Data pretty printer](https://docs.python.org/3/library/pprint.html)
print data structures in a readable format\n# * [Project Gutenberg catalog](http://www.gutenberg.org/catalog/)
find electronic texts from Project Gutenberg's collection that are not included in NLTK.\n# * [textfiles.com](http://www.textfiles.com/directory.html)
A useful source for finding plain text files.\n# * [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
A Python library that helps us work with HTML and XML\n\n#%%\n\nfrom nltk import book\nimport os\nfrom bs4 import BeautifulSoup as bs\nfrom nltk import FreqDist\nimport nltk\nimport re\nimport pprint\nfrom nltk import word_tokenize\nfrom urllib import request\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Getting the text\n# Find a text from the Project Guttenberg colleciton or from textfile.com using urllib. You should browse the website to get the url you need.\n\n#%%\n\nurl = 'http://www.gutenberg.org/cache/epub/7178/pg7178.txt'\nresponse = request.urlopen(url)\nraw_text = response.read().decode('utf8')\n\n\n# We just retrieved the text for Marcel Proust's 'Swann's Way' from the Project Guttenberg catalog and turned into plain text (i.e. a string)\n#\n\n#%%\n\ntype(raw_text)\n\n#%%\n\n# this will tell us how many characters (not words) long the text is.\n# In order to get a word count we need to do some processing to this text.\n\nlen(raw_text)\n\n#%%\n\nraw_text[:100]\n\n\n# ## Tokenization", "original_comment": "# Turning the text into words using the nltk word_tokenizer\n", "target_code": "from nltk import word_tokenize\n\nwords_text = word_tokenize(raw_text)\n", "project_metadata": {"full_name": "derekjjackson/DH_PythonLibraries_JupyterNotebooks", "description": "FIles and resources for using Data Science, Python, and Jupyter Notebooks in the practice of Digital Humanities", "topics": [], "git_url": "git://github.com/derekjjackson/DH_PythonLibraries_JupyterNotebooks.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2018-10-20T15:06:33Z", "size": 29200, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 10076299}, "last_updated": "2020-12-25T21:05:12Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "nltk.word_tokenize(raw_text)\n", "model": "docstring", "intent": "# Turning the text into words using the nltk word_tokenizer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- 
way\n\n\ndata.dtypes\n\n\ndata.size\n\n\ndata.info()\n\n\ndata['RAM'].describe()\n\n\ndata.describe()\n\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n\ndf.info()\n\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n\nlen(df)\n\n\ndf.fillna(np.nan)\ndf\n\n\ndf_dropped = df.dropna()\ndf_dropped\n\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n\ndf.head(2)\n\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n\n# checking unique values\ndfm['GPRS'].unique()\n\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n\n# checking data types\ndfm.dtypes\n\n\n# count of every column\ndfm.count()\n\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n\ndf1 = df_dropped\nlen(df1)\n\n\n# checking the datatypes\ndf1.dtypes\n\n\n# displaying info\ndf1.info()\n\n\ndf1.head()\n\n\ndf1.tail()\n\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n#%%\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n#%%\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- 
way\n\n#%%\n\ndata.dtypes\n\n#%%\n\ndata.size\n\n#%%\n\ndata.info()\n\n#%%\n\ndata['RAM'].describe()\n\n#%%\n\ndata.describe()\n\n#%%\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n#%%\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n#%%\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n#%%\n\nlen(df)\n\n#%%\n\ndf.fillna(np.nan)\ndf\n\n#%%\n\ndf_dropped = df.dropna()\ndf_dropped\n\n#%%\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n#%%\n\ndf.head(2)\n\n#%%\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n#%%\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking unique values\ndfm['GPRS'].unique()\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n#%%\n\n# count of every column\ndfm.count()\n\n#%%\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n#%%\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n#%%\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n#%%\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n#%%\n\ndf1 = df_dropped\nlen(df1)\n\n#%%\n\n# checking the datatypes\ndf1.dtypes\n\n#%%\n\n# displaying info\ndf1.info()\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf1.tail()\n\n#%%\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n#%%\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n#%%", "original_comment": "# changing the datatype\n", "target_code": "df1[['display_size']] = df1[['display_size']].apply(pd.to_numeric)\n", "project_metadata": {"full_name": "yatinagg/Mobile_Price_Classification", "description": "Dataritz Phone Price Classification", "topics": [], "git_url": "git://github.com/yatinagg/Mobile_Price_Classification.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-05-18T15:04:18Z", "size": 6525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4727409, "Python": 568}, "last_updated": "2020-08-30T08:37:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", 
"coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df2 = df1.copy()\ndf2['GPRS'] = df2['GPRS'].astype(int)\ndf2['EDGE'] = df2['EDGE'].astype(int)\ndf2['3G_bands'] = df2['3G_bands'].astype(int)\ndf2['4G_bands'] = df2['4G_bands'].astype(int)\n", "model": "no-comments", "intent": "# changing the datatype"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n\ntrain_df.head()\n\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n#%%\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n#%%\n\ntrain_df.head()\n\n#%%\n\n# converting all the columns other than label into a numpy array\ntrain_data = 
np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n#%%\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)", "original_comment": "# To cast data into float32 type\n", "target_code": "train_data = train_data.astype('float32')\n", "project_metadata": {"full_name": "aditya2000/MNIST-Fashion-", "description": null, "topics": [], "git_url": "git://github.com/aditya2000/MNIST-Fashion-.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2019-07-10T10:06:01Z", "size": 40, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 76938}, "last_updated": "2020-09-28T23:05:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train_data = train_data.astype('float32')\ntest_data = test_data.astype('float32')\n", "model": "natural", "intent": "# To cast data into float32 type"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n\ncelldom.get_repo_dir()\n\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n#%%\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be 
loaded:\n\n#%%\n\ncelldom.get_repo_dir()\n\n#%%\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n#%%\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n#%%\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n#%%\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n#%%\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n#%%", "original_comment": "# Test that the path can be parsed successfully\n", "target_code": "exp_config.parse_path(test_path)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "get_ipython().run_cell_magic('capture', '',\n '%%bash\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo pipefail\\n\\n#set -euo\n", "model": "no-comments", "intent": "# Test that the path can be parsed successfully"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n 
ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n#%%\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n#%%\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n#%%\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n#%%\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n#%%\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n#%%\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n#%%\n\nlearn = create_learner(data, 
'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n#%%\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n#%%\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n#%%\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n#%%\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n#%%", "original_comment": "# Reload with new savename\n", "target_code": "learn = create_learner(data, 'stage2-bestmodel')\nlearn.load('stage1-bestmodel')\n", "project_metadata": {"full_name": "URBNOpenSource/custom-vision-study", "description": null, "topics": [], "git_url": "git://github.com/URBNOpenSource/custom-vision-study.git", "stars": 5, "watchers": 5, "forks": 4, "created": "2019-03-12T20:31:02Z", "size": 19785, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5009642, "Python": 5509, "Shell": 928}, "last_updated": "2019-10-24T13:27:26Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "learn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, 
evalset='test')\n", "model": "natural", "intent": "# Reload with new savename"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n#%%\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n#%%\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n#%%\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)", "original_comment": " # extract dimensions\n", "target_code": " original_image_height, original_image_width = gray_img.shape\n", "project_metadata": {"full_name": "howarder3/ironman2020_OpenCV_photoshop", "description": null, "topics": [], "git_url": "git://github.com/howarder3/ironman2020_OpenCV_photoshop.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-12T15:55:03Z", "size": 125635, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 131231786}, "last_updated": "2020-12-23T03:20:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "img = cv2.resize(cv2.imread(\n '../input/opencv-samples-images/opencv_logo_black_and_white.png'))\nshow_img(img)\n", "model": "docstring", "intent": " # extract dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\n\n\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n\ncorpus\n\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n\n# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn\n# 
https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n\npred\n\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n#%%\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n#%%\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n#%%\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n#%%\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n#%%\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\n\n\n#%%\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n#%%\n\ncorpus\n\n#%%\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n#%%\n\n# AFINN Dictionary for Sentiment Analysis: 
https://github.com/fnielsen/afinn\n# https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n#%%\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n#%%\n\npred\n\n#%%\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n#%%\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values", "original_comment": " # as usual, we use text-embeddings\n", "target_code": " text = text_embedding(text)\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "sentiment_dataset[0]\n", "model": "natural", "intent": " # use text-embeddings"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel 
ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]\n\n\n\ndef print_hw_parameters(model):\n alpha, beta, gamma = model.params['smoothing_level'], model.params[\n 'smoothing_slope'], model.params['smoothing_seasonal']\n print(\"Holt-Winters parameters:\")\n print(\"Alpha: \", alpha)\n print(\"Beta: \", beta)\n print(\"Gamma: \", gamma)\n\n\nprint(\"Forecasting started...\")\nstart_time = time.time()\n\ntry:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n#%%\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n#%%\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n#%%\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n#%%\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n#%%\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and 
test series\n# We have decided to split train and test time series by two weeks.\n\n#%%\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]", "original_comment": "# ### Fit and predict Time Serie\n", "target_code": " model = hw.ExponentialSmoothing(\n ts_train, seasonal='additive', seasonal_periods=train_data_length-1).fit()\n predictions = model.predict(start=ts_test.index[0], end=ts_test.index[-1])\n", "project_metadata": {"full_name": "CSIRT-MU/QoSForecastLSTM", "description": "An evaluation of QoS forecast methods described in paper Quality of Service Forecasting with LSTM Neural Networks", "topics": ["publication"], "git_url": "git://github.com/CSIRT-MU/QoSForecastLSTM.git", "stars": 4, "watchers": 4, "forks": 2, "created": "2018-09-05T07:37:36Z", "size": 10237, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16021131}, "last_updated": "2020-03-27T12:49:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model = sm.tsa.statespace.SARIMAX(ts_train, order=(\n 1, 1, 1), seasonal_order=(0, 1, 1, 12), enforce_stationarity=False, enforce_invertibility=False)\nresults = model.fit()\nprint_hw_parameters(results)\nprint(\"--- %s seconds ---\" % (time.time() - start_time))\n", "model": "natural", "intent": " # Fit and predict Time Serie"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time 
calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 
'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in 
km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n 
# Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = 
sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. 
Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, 
geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n 
df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 
years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = 
df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n#%%\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n#%%\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n#%%\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n#%%\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = 
pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n#%%\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n#%%\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n#%%\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero 
for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed", "original_comment": "# Write to disk\n", "target_code": "df_Diffusion.to_csv(os.path.join(data_subdirectory, 'df_Diffusion.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_Diffusion.to_csv('data/df_Diffusion.csv')\n", "model": "no-comments", "intent": "# Write to disk"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n\nsns.set(style=\"ticks\", 
palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## Facet Grid\n\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n\n#%%\n\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Gr\u00e1ficos categ\u00f3ricos\n\n# ### Bar Plot:\n\n#%%\n\ndf = sns.load_dataset(\"iris\")\ndf.head()\n\n#%%\n\nfig, axes = plt.subplots(figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"],\n palette=\"pastel\", data=df, ax=axes, estimator=np.mean)\n\n#%%\n\nfig, axes = plt.subplots(1, 2, figsize=(6, 4))\nsns.barplot(x=df[\"species\"], y=df[\"sepal_length\"], palette=\"Paired\",\n data=df, ax=axes[0], estimator=np.count_nonzero)\nsns.barplot(x=df[\"species\"], y=df[\"petal_length\"],\n palette=\"Set2\", data=df, ax=axes[1], estimator=np.mean)\n\n\n# ### Boxplot:\n\n#%%\n\nplanets = sns.load_dataset(\"planets\")\nplanets.head()\n\n#%%\n\nsns.set(style=\"ticks\", palette=\"muted\")\nax = sns.boxplot(x=\"distance\", y=\"method\", data=planets)\nax.set_xscale(\"log\")\n\n\n# ### Violin Plot:\n\n#%%\n\ntips = sns.load_dataset(\"tips\")\ntips.head()\n\n#%%\n\nsns.set(style=\"whitegrid\")\nsns.violinplot(x=\"time\", y=\"total_bill\", data=tips, palette=\"rainbow\")\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", data=tips,\n palette=\"rainbow\", hue='sex')\n\n\n# #### Grouped violinplots with split violins\n\n#%%\n\nsns.violinplot(x=\"day\", y=\"total_bill\", hue=\"sex\", data=tips, split=True, inner=\"quart\",\n palette={\"Male\": \"#33FFF8\", \"Female\": \"#FDFF33\"})\n\n\n# ### Scatter Plot Matrix: Pairplot () function\n\n#%%\n\nsns.pairplot(df, hue=\"species\", palette='cubehelix')\n\n\n# ### Cat Plot: funci\u00f3n general de generaci\u00f3n de gr\u00e1ficos en seaborn\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.catplot(\"day\", \"total_bill\", \"sex\", data=tips,\n kind=\"box\", palette='cubehelix')\ng.set_axis_labels(\"Day\", \"Total Bill\")\n\n\n# ## 
Facet Grid\n\n#%%\n\nsns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"time\", row=\"smoker\")\ng = g.map(plt.hist, \"total_bill\", color='red')\n\n\n# ### Change the size and aspect ratio of each facet:\n\n#%%\n\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.hist, \"total_bill\", color=\"green\")\n\n\n# ### Setting the color palette:\n\n#%%\n\nkws = dict(s=40, linewidth=.5, edgecolor=\"w\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\",\n palette=\"Set2\", hue_order=[\"Dinner\", \"Lunch\"])\ng = g.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()", "original_comment": "# ### Use a different marker for the hue levels:\n", "target_code": "palette = dict(Lunch=\"blue\", Dinner=\"red\")\ng = sns.FacetGrid(tips, col=\"sex\", hue=\"time\", palette=palette,\n hue_order=[\"Dinner\", \"Lunch\"],\n hue_kws=dict(marker=[\"^\", \"v\"]))\n", "project_metadata": {"full_name": "scidatmath2020/Ciencia-de-datos-con-Python", "description": null, "topics": [], "git_url": "git://github.com/scidatmath2020/Ciencia-de-datos-con-Python.git", "stars": 20, "watchers": 20, "forks": 27, "created": "2020-09-07T20:49:59Z", "size": 20544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5705341, "Python": 12821}, "last_updated": "2020-11-19T22:06:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.set(style=\"ticks\")\ng = sns.FacetGrid(tips, col=\"smoker\", col_order=[\n \"Yes\", \"No\"], height=4, aspect=1)\ng.map(plt.scatter, \"total_bill\", \"tip\", **kws).add_legend()\n", "model": "natural", "intent": "# Use a different marker for the hue levels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = 
pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n 
data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = 
hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n\nplt.close()\nsns.barplot(x='model', y='r2', 
data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. Starting with Gene Set 2:\n\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n 
xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', 
len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. 
It's copied exactly from NB7. Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
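(Schematically, and only as a restatement of the loop in the cell below rather than any extra analysis: for one cluster and one anterior/posterior direction, the scoring step amounts to\n#\n#     wdf = weighter[weighter.AP == 'anterior']        # probes carrying nonzero cluster-centrality weights\n#     X = bigdf.loc[wdf.index].T                       # samples x probes expression matrix\n#     cluster_score = (X * wdf.weight.values).mean(1)  # centrality-weighted mean expression, i.e. the cluster score\n#\n# so the cluster score is simply a centrality-weighted mean of expression; nothing beyond what the loop below already computes.) 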
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, 
posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n\ncr3.sort_values('index', inplace=True)\n\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n#%%\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n#%%\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n#%%\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n#%%\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), 
columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n#%%\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n#%%\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n#%%\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n#%%\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n#%%\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n#%%\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n#%%\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 
3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n#%%\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n#%%\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n#%%\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get 
predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n#%%\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n#%%\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n#%%\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n#%%\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n#%%\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. 
Starting with Gene Set 2:\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n#%%\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. 
These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n#%%\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n#%%\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n#%%\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n#%%\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. 
I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n#%%\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. 
Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n#%%\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n#%%\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n#%%\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n#%%\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n#%%\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n#%%\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n#%%\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# 
print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n#%%\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n#%%\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n#%%\n\ncr3.sort_values('index', inplace=True)\n\n#%%\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n#%%\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)", "original_comment": "# Cluster and plot\n", "target_code": "g = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='Reds')\nplt.show()\n", "project_metadata": {"full_name": "illdopejake/Hippocampus_AP_Axis", "description": "Code used for Hippocampus Anterior/Posterior gene expression and neuroimaging analyses ", "topics": [], "git_url": "git://github.com/illdopejake/Hippocampus_AP_Axis.git", "stars": 7, "watchers": 7, "forks": 1, "created": "2018-05-20T18:18:47Z", "size": 149297, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 20340748, "Python": 58444, "Shell": 2454}, "last_updated": "2020-12-20T09:17:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "cluster = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=3,\n cluster_selection_method='leaf',\n metric='euclidean',\n metric_params=None,\n prefer_connectivity=True,\n min_cluster_size=3,\n prefer_dendrogram=True,\n no_rotation=True,\n convergence_iter=20,\n no_jobs=None,\n algorithm='auto',\n predict_labels=True,\n copy_x\n", "model": "docstring", "intent": "# Cluster and plot"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. 
Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. 
These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), 
str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. 
Let's also save the dataset locally to `GDrive`:\n\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n\n# number of anomalies vs. regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. 
Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. 
Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. 
These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n#%%\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n#%%\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' 
computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n#%%\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n#%%\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n#%%\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n#%%\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n#%%\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. 
Let's also save the dataset locally to `GDrive`:\n\n#%%\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n#%%\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n#%%\n\n# number of anomalies vs. regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n#%%\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. 
Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n#%%\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n#%%\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n#%%\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n#%%\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n#%%\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)", "original_comment": "# normalize all numeric attributes to the range [0,1]\n", "target_code": "ori_dataset_num_processed = (\n numeric_attr - numeric_attr.min()) / (numeric_attr.max() - numeric_attr.min())\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "ori_dataset_norm = (ori_dataset - ori_dataset.min()) / \\\n (ori_dataset.max() - ori_dataset.min())\n", "model": "docstring", "intent": "# normalize all numeric attributes to the range [0,1]"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. If you are sure of it go-forward & run the cells below.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n#%%\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. 
If you are sure of it go-forward & run the cells below.\n\n#%%", "original_comment": "# Download the raw-data files\n", "target_code": "get_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1gkgUlkaRXUzrNR_jY42ieK4xtLX3ztKX'\")\nget_ipython().system(\n \"unzip 'raw_data_files.zip' -d '/content/drive/My Drive/ICDMAI_Tutorial/'\")\n", "project_metadata": {"full_name": "amitbcp/icdmai_2020", "description": "This repository is for the Session held in International Conference on Data Management, Analytics and Innovation, New Delhi 2020", "topics": ["deeplearning", "recurrent-neural-networks", "rnn-pytorch", "word-embeddings", "text-classification", "rnns", "notebooks", "stackoverflow", "tag-recommender", "recommendation-system", "svm", "onevsrest"], "git_url": "git://github.com/amitbcp/icdmai_2020.git", "stars": 7, "watchers": 7, "forks": 2, "created": "2020-01-04T04:42:01Z", "size": 13078, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2676004}, "last_updated": "2021-01-06T14:44:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "zip_ref = zipfile.ZipFile(\n '/content/drive/My Drive/ICDMAI_Tutorial/notebook/ICDMAI_Tutorial.zip', 'r')\nzip_ref.extractall('/content')\nzip_ref.close()\n", "model": "docstring", "intent": "# Download the raw-data files"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = 
train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = 
test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Fire Up\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LinearRegression\nfrom collections import Counter\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.decomposition import FastICA\nfrom sklearn.decomposition import PCA\nimport xgboost as xgb\nimport numpy as np\nimport pandas as pd\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\ntrain = pd.read_csv('train_adjusted.csv')\ntest = pd.read_csv('test_c.csv')\n\n\n# ## Convert Categorical Into Numerical\n\n#%%\n\ncol = list(test.columns)[2:]\ncat = []\nfor each in col:\n if train[each].dtype == 'object' or 'ID' in each:\n train[each] = pd.factorize(train[each], sort=True)[0]\n test[each] = pd.factorize(test[each], sort=True)[0]\n cat.append(each)\n\n\n# ## Cleansing\n\n#%%\n\nbad_index = train[train.life_sq > train.full_sq].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq > test.full_sq].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.life_sq < 5].index\ntrain.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = test[test.life_sq < 5].index\ntest.ix[bad_index, \"life_sq\"] = np.NaN\nbad_index = train[train.full_sq < 5].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[test.full_sq < 5].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.kitch_sq >= train.life_sq].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[test.kitch_sq >= test.life_sq].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = train[(train.kitch_sq == 0).values +\n (train.kitch_sq == 1).values].index\ntrain.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = test[(test.kitch_sq == 0).values +\n (test.kitch_sq == 1).values].index\ntest.ix[bad_index, \"kitch_sq\"] = np.NaN\nbad_index = 
train[(train.full_sq > 210) & (\n train.life_sq / train.full_sq < 0.3)].index\ntrain.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = test[(test.full_sq > 150) & (\n test.life_sq / test.full_sq < 0.3)].index\ntest.ix[bad_index, \"full_sq\"] = np.NaN\nbad_index = train[train.life_sq > 300].index\ntrain.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = test[test.life_sq > 200].index\ntest.ix[bad_index, [\"life_sq\", \"full_sq\"]] = np.NaN\nbad_index = train[train.build_year < 1500].index\ntrain.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = test[test.build_year < 1500].index\ntest.ix[bad_index, \"build_year\"] = np.NaN\nbad_index = train[train.num_room == 0].index\ntrain.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = test[test.num_room == 0].index\ntest.ix[bad_index, \"num_room\"] = np.NaN\nbad_index = train[(train.floor == 0).values *\n (train.max_floor == 0).values].index\ntrain.ix[bad_index, [\"max_floor\", \"floor\"]] = np.NaN\nbad_index = train[train.floor == 0].index\ntrain.ix[bad_index, \"floor\"] = np.NaN\nbad_index = train[train.max_floor == 0].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.max_floor == 0].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.floor > train.max_floor].index\ntrain.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = test[test.floor > test.max_floor].index\ntest.ix[bad_index, \"max_floor\"] = np.NaN\nbad_index = train[train.state == 33].index\ntrain.ix[bad_index, \"state\"] = np.NaN\n\n\n# ## Extra Feature Addition\n\n#%%\n\n# Add month-year\ntrain['timestamp'] = pd.to_datetime(train['timestamp'])\nmonth_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntrain['month_year_cnt'] = month_year.map(month_year_cnt_map)\ntest['timestamp'] = pd.to_datetime(test['timestamp'])\nmonth_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)\nmonth_year_cnt_map = month_year.value_counts().to_dict()\ntest['month_year_cnt'] = month_year.map(month_year_cnt_map)\n# Add week-year count\nweek_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntrain['week_year_cnt'] = week_year.map(week_year_cnt_map)\nweek_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)\nweek_year_cnt_map = week_year.value_counts().to_dict()\ntest['week_year_cnt'] = week_year.map(week_year_cnt_map)\n# Add month and day-of-week\ntrain['month'] = train.timestamp.dt.month\ntrain['dow'] = train.timestamp.dt.dayofweek\ntest['month'] = test.timestamp.dt.month\ntest['dow'] = test.timestamp.dt.dayofweek\n# Other feature engineering\ntrain['rel_floor'] = train['floor'] / train['max_floor'].astype(float)\ntrain['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)\ntest['rel_floor'] = test['floor'] / test['max_floor'].astype(float)\ntest['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)\ntrain['room_size'] = train['life_sq'] / train['num_room'].astype(float)\ntest['room_size'] = test['life_sq'] / test['num_room'].astype(float)\n\n\n# ## Involve Macro Features\n\n#%%\n\nmacro_cols = ['timestamp', \"balance_trade\", \"balance_trade_growth\", \"eurrub\", \"average_provision_of_build_contract\",\n \"micex_rgbi_tr\", \"micex_cbi_tr\", \"deposits_rate\", \"mortgage_value\", \"mortgage_rate\",\n \"income_per_cap\", \"rent_price_4+room_bus\", \"museum_visitis_per_100_cap\", \"apartment_build\"]\nmacro = 
pd.read_csv('macro_c.csv')[macro_cols]\ntrain = train.merge(macro, how='left', on='timestamp')\ntest = test.merge(macro, how='left', on='timestamp')\n\n\n# ## Create PCA Features\n\n#%%\n\ntrain_fill = train.fillna(-999)\ntest_fill = test.fillna(-999)\nn_comp = 20\npca = PCA(n_components=n_comp, random_state=42)\npca_results_train = pca.fit_transform(train_fill[col])\npca_results_test = pca.transform(test_fill[col])\n\n\n# ## Create ICA Features\n\n#%%\n\nica = FastICA(n_components=n_comp, random_state=42)\nica_result_train = ica.fit_transform(train_fill[col])\nica_result_test = ica.transform(test_fill[col])\n\n\n# ## Put features in original dataset\n\n#%%\n\nfor i in range(1, n_comp + 1):\n train['pca_' + str(i)] = pca_results_train[:, i - 1]\n test['pca_' + str(i)] = pca_results_test[:, i - 1]\n train['ica_' + str(i)] = ica_result_train[:, i - 1]\n test['ica_' + str(i)] = ica_result_test[:, i - 1]\n\n#%%\n\nReducedVar = []\nfor each in list(test.columns):\n if 'pca' in each or 'ica' in each:\n ReducedVar.append(each)", "original_comment": "# ## Prepare a train/test set for Validating Stacking\n", "target_code": "from sklearn.model_selection import train_test_split\n\ntrain_ = train.fillna(-999)\ntest_ = test.fillna(-999)\ntraining, testing = train_test_split(train_, test_size=0.2, random_state=42)\n", "project_metadata": {"full_name": "liujiashen9307/KaggleCompetition", "description": "Code hub for the kaggle competitions I have participated in.", "topics": [], "git_url": "git://github.com/liujiashen9307/KaggleCompetition.git", "stars": 6, "watchers": 6, "forks": 10, "created": "2016-10-12T21:10:54Z", "size": 15258, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16811198, "HTML": 14162298, "Python": 1658600, "R": 8306}, "last_updated": "2020-02-01T03:33:11Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "X_train = train[train.columns[ReducedVar]]\nX_test = test[test.columns[ReducedVar]]\n", "model": "natural", "intent": "# Prepare a train/test set for Validating Stacking"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. 
The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. 
We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n dataset.loc[dataset['Fare'] 
<= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\ndrop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. 
A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n#%%\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n#%%\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n 
dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\n#%%", "original_comment": "# Feature selection: remove variables no longer containing relevant information\n", "target_code": "train = train.drop(drop_elements, axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "coverage": "Disagree", "coverage-score": 1, "usefulness": "Agree", "usefulness-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "X = dataset.drop(drop_elements, axis=1)\ny = dataset['Survived']\n", "model": "docstring", "intent": "# Feature selection: remove variables no longer containing relevant information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. 
The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. 
The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n\nfrom numpy.random import randint\nimport numpy as np\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Probability Distributions\n# Many standard statistical models operate under the assumption that your dataset takes on a certain distribution, such a Gaussian (normal). The probability distribution describes the array of all possible outcomes that a random variable can take, along with the probability of each possibility. Depending on the random variable and the situation, a number of different distributions are possible.\n#\n# ![winter](./winter.jpeg)\n#\n#\n# ## Discrete vs Continuous Random Variables\n# The methods we can use for calculating probability are determined by whether a variable is continuous or discrete.\n#\n# If we blindly pick a random variable X from our dataset, the possible values for X can be described as discrete if X can only take on a certain number of distinct values, such as in a coin flip where X can only equal heads or tails, and continuous if X can take on an infinite number of possible values, such as weight or height.\n#\n# ![prob_dist](./probabilty_dist.png)\n#\n# The diagram above shows the probablity densities for some of the most common distirubtions. The horizontal (X) axis in each box is the set of possible numeric outcomes. The vertical (Y) axis describes the probability of outcomes.\n#\n# Discrete distributions are mostly portrayed at the top of the chart, where the line height represents the probability of that outcome, while the bottom half represents continuous distributions as a curve, with each possible outcome falling somewhere in the area below the curve.\n#\n# ### Discrete Distributions\n# If X is discrete and can only take on a limited set of values, then we can calculate the probility that X is either heads or tails, called it's **probability mass function**. Examples of these types of distributions are Bernoulli, Binomial and Poisson.\n# - `disrete_X = [\"heads\", \"tails\"]`\n# - `P(X) = 1/2`\n#\n# ### Continuous Distributions\n# For a continuous X, the probability that X can belong to any particular range of values is known as it's **probability density function**. 
Rather than trying to determine the probability that X is equal to a specific value, we are interested in the probability that X falls in the range of real numbers.\n# - `continuous_X = \u221e`\n# - `P(X=x) = 0` Because X can take on any, or all, real values, then there is a set of infinite possible values\n#\n# Even though we cannot calculate the probability that X equals a particular random number, we can calculate the probability that a value, k, lies within the range of X, by calculating the probability density function for X, and determining if k falls in the area under the curve.\n# ![image.png](attachment:image.png)\n# Where:\n# - \u03bc (mu) is the population mean\n# - \u03c3 (sigma) is the standard deviation\n# - \u03c32 is the variance\n#\n# ![standard_dist](standard_dist.png)\n#\n# We can see above that on the y axis, any output value from a probability density function is greater than or equal to zero, and less than 1.\n\n# ## Standard Normal Distribution (aka Gaussian)\n# The most famous, and most widely used, probability distribution function is the Gaussian, or standard normal, distribution.\n#\n# A Gaussian distribution has the following properties:\n# - Mean = 0\n# - Standard deviation = 1\n#\n# ### Central Limit Theroum\n# One of the reasons this distribution is used as the gold standard is due to the Central Limit Theorum- with a large enough sample size, sample means become normally distributed (mean = 0, std dev = 1). This provides us with a baseline we can use to determine what is a reasonable expected range for the value of k.\n#\n# Basically, we have observed that no matter what kind of distribution you have, if you draw groups of random samples from that distribution, their means will be normally distributed. How cool is that?\n#\n# For example, if you randomly picked 10 people out of 100 and recorded their heights, the average of those 10 heights would be the sample mean. If you did this enough times, plotting the means as you go along, you would see that with each draw, the distribution becomes roughly normal.\n\n# ### CLT with dice\n# A standard die is a cube with six possible outcomes ranging from 1 to 6, with the probability of rolling any one number 1/6. 
The distribution of the numbers that turn up from a dice roll is uniform given the equal likelihood.\n#\n# We'll use numpy's randint() function to simulate 50 dice rolls and landing on a randome number between 1 and 6:\n\n#%%\n\nfrom numpy.random import randint\nimport numpy as np\nfrom matplotlib import pyplot as plt\n\n# generate a sample of die rolls\nrolls = randint(1, 7, 50)\nrolls\n\n\n# Now let's view the mean at 50 rolls:\n\n#%%\n\nprint(np.mean(rolls))\n\n\n# We know that the expected mean for this distribution can be calculated as the sum of each die divided by total possiblities:\n\n#%%\n\n(1 + 2 + 3 + 4 + 5 + 6) / 6\n\n\n# Which is different than our sampled mean, as expected, because it's estimating the population mean from our random draws.\n#\n# #### 1000 samples\n#\n# Let's grab 100 samples of 50 and see how that impacts our distribution and mean:\n\n#%%\n\nmeans_100 = [np.mean(randint(1, 7, 50)) for _ in range(100)]\n\n#%%", "original_comment": "# plot the distribution of sample means\n", "target_code": "from matplotlib import pyplot as plt\n\nplt.hist(means_100)\nplt.show()\n", "project_metadata": {"full_name": "summerela/python_data_analysis", "description": "Introduction to Data Analysis with Python for UW Foster School of Business", "topics": [], "git_url": "git://github.com/summerela/python_data_analysis.git", "stars": 11, "watchers": 11, "forks": 27, "created": "2019-06-08T02:35:32Z", "size": 7972, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 7883836}, "last_updated": "2020-11-09T16:54:13Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "compatibility": "Strongly agree", "compatibility-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3, "precision": "Strongly agree", "precision-score": 3}], "predicted_code": "sns.distplot(means_100)\n", "model": "docstring", "intent": "# plot the distribution of sample means"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n\nprint(spam.head())\n\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n\nprint(data_set[:5])\n\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize 
into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n\nprint(training_set[:5])\n\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n#%%\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n#%%\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n#%%\n\nprint(spam.head())\n\n#%%\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n#%%\n\nprint(data_set[:5])\n\n#%%\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n#%%\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n#%%\n\ndef 
preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n#%%\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n#%%\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n#%%\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n#%%\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n#%%\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n#%%\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n#%%\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n#%%\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n#%%\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n#%%\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n#%%\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n#%%\n\nprint(training_set[:5])\n\n#%%\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n#%%", "original_comment": "# Training the classifier with NaiveBayes algorithm\n", "target_code": "spamClassifier = nltk.NaiveBayesClassifier.train(training_set)\n", "project_metadata": {"full_name": "beingdatum/NaturalLanguageProcessing", "description": null, "topics": [], "git_url": "git://github.com/beingdatum/NaturalLanguageProcessing.git", "stars": 3, "watchers": 3, "forks": 10, "created": "2020-01-01T13:54:22Z", "size": 23376, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2267856, "Python": 1378}, "last_updated": "2020-06-08T09:54:47Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", 
"usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", "model": "natural", "intent": "# Training the classifier with NaiveBayes algorithm"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import 
linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n#%%\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n#%%\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n#%%\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()", "original_comment": "# #### Check Missing Value\n", "target_code": "for col in df_train.columns:\n if any(df_train[col].isnull()):\n print(\"feature %s, missing %i entries\" %\n (col, sum(df_train[col].isnull())))\n else:\n print(\"feature %s has no missing value\" % col)\n", "project_metadata": {"full_name": "shawlu95/Data-Science-Toolbox", "description": "Examples and illustration of basic statistic concepts, probability distribution, Monte Carlo simulation, preprocessing and visualization techniques, and statistical testing.", "topics": [], "git_url": "git://github.com/shawlu95/Data-Science-Toolbox.git", "stars": 28, "watchers": 28, "forks": 11, "created": "2019-03-25T19:58:55Z", "size": 157445, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 52401937, "Python": 36992, "TSQL": 3834, "PLpgSQL": 3609, "Shell": 3459, "R": 1437}, "last_updated": "2020-12-26T18:51:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "pd.isnull(df_train).sum()\n", "model": "docstring", "intent": "# Check Missing Value"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * 
**bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. \n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). 
I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. 
(Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport plotly.graph_objs as go\nimport plotly\nfrom textblob import TextBlob\nimport scipy\nfrom gensim.models import KeyedVectors\nimport pickle\nimport pandas as pd\nfrom __future__ import division\nfrom numbers import Number\nimport sys\nimport codecs\nimport numpy as np\nimport sqlite3\nimport nltk\n\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Load data\n# There are 2 datasets I need:\n# * **bgg_ratings_comments_full_r1.db**: SQL db storing all of the reviews for 2000+ games (along with username, rating, game name, and game id) for each review.\n# * **bgg_gamelist.csv**: CSV file containing some meta-data for each of the games (one row per game).\n#\n# Ultimately I merge these two together and save it in a Dataframe called **df_allgames**.\n\n#%%\n\ndef import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name', 'n_ratings', 'pic_url']):\n \"\"\" \n Import the Board Game Meta data from csv file.\n Input: \n 1. meta_gamelist_filename: file name of game list data with meta data (e.g., gameID, etc)\n 2. col_names: what you want to name the columns of the dataset\n \"\"\"\n\n # Read in csv file with meta game info\n df_meta_gamelist = pd.read_csv(meta_gamelist_filename)\n df_meta_gamelist.columns = col_names # prettify column names\n\n return df_meta_gamelist\n\n\ndef import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Import data from database file that contains review data (acquired from BGG API).\n Default sql_query extracts all non-empty reviews that contain a rating >0 (this\n eliminates a number of rows that have 0s across all columns)\n \"\"\"\n connex = sqlite3.connect(df_filename) # Opens DB file\n cur = connex.cursor() # Establish communication with DB\n\n sql = sql_query + \";\" # SQL query\n df_reviews_and_ratings = pd.read_sql_query(\n sql, connex) # Read DB data into\n # Drop the duplicate rows\n df_reviews_and_ratings = df_reviews_and_ratings.drop_duplicates()\n\n connex.close() # close connection to db\n\n return df_reviews_and_ratings\n\n\ndef merge_meta_and_review_dfs(df_meta_gamelist, df_reviews_and_ratings):\n \"\"\"\n Merge meta gamelist df together with reviews df.\n \"\"\"\n # Merge game meta data with reviews\n df_merged = pd.merge(df_reviews_and_ratings, df_meta_gamelist,\n how='left', on='gameid')\n return df_merged\n\n\ndef import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\"):\n \"\"\"\n Implement all previous functions to import meta data and review data and merge\n them together. 
\n \"\"\"\n df_meta_gamelist = import_meta_gamelist(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=['gameid', 'name',\n 'n_ratings', 'pic_url'])\n df_reviews_and_ratings = import_reviews_from_db(db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n df_merged = merge_meta_and_review_dfs(df_meta_gamelist=df_meta_gamelist,\n df_reviews_and_ratings=df_reviews_and_ratings)\n\n return df_merged, df_meta_gamelist\n\n#%%\n\n# Pull review data from DB and merge with game meta data (df_allgames)\n# Also returnt the df_meta_gamelist as bgg_gamelist\ndf_allgames, bgg_gamelist = import_and_merge_datasets(meta_gamelist_filename='bgg_gamelist.csv',\n col_names=[\n 'gameid', 'name', 'n_ratings', 'pic_url'],\n db_filename=\"bgg_ratings_comments_full_r1.db\",\n sql_query=\"SELECT * FROM data WHERE value!='' AND rating>0\")\n\n#%%\n\ndf_allgames.head()\n\n\n# # Text Cleaning:\n# Remove board game names and Tokenize reviews\n\n# ### Step 1: Make text in reviews all lowercase, and remove almost all board game names from reviews\n# Sometimes reviewers mention the board game names in reviews (e.g., \"I first played *Splendor* with my friends in grad school.\"). I did not use any entity recognition techniques (like spacy), and therefore name dropping would only introduce noise into my average word vectors. To eliminate noise, I decided to filter (most of) the names out of the reviews. The exceptions included games that had names that can occur in other unrelated words (e.g., 'coup' is in the word 'couple').\n\n#%%\n\ndef cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames,\n raw_review_col_name='value',\n df_gamelist=bgg_gamelist,\n game_name_col_df_gamelist='name'):\n \"\"\"\n This function makes all text in reviews lowercase, and removes all\n board game names from review text (except for those on the \n nameExclude_list - see below). \n Input: All defaults\n Note: game_name_col_df_gamelist is the column within df_gamelist\n containing the names of the board games. 
1 row per game in this df.\n Output: df_with_reviews df with a new column (rev_LC_noNames) \n containing the reviews \n \"\"\"\n\n # Make a list with all the board game names\n gameName_list = df_gamelist[game_name_col_df_gamelist].unique().tolist()\n gameName_list = [game.lower() for game in gameName_list] # make lower case\n # print(len(gameName_list)) # debugging\n\n # Problem names: Names that appear too often in real words.\n # Don't remove these from reviews.\n nameExclude_list = ['ys', 'go', 'coup', 'ra', 'goa', 'set', 'pit',\n 'fuse', 'roma', 'evo', 'aton', 'fits', 'frag',\n 'pairs', 'rage', 'edo', 'war', 'tak']\n #[token for token in tokens if token not in en_stop]\n gameName_list = [\n game for game in gameName_list if game not in nameExclude_list]\n # print(len(gameName_list)) # debugging\n\n # Remove all board game names from the reviews.\n\n # Make reviews all lower case and replace game names with nothing\n # make raw reviews lowercase and save to new column named 'comments'\n # Note: this doesn't take long to run.\n df_with_reviews['rev_LC_noNames'] = df_with_reviews[raw_review_col_name].apply(\n lambda x: x.lower())\n\n # Replace game names with \"\"\n # Note: This loop takes a while (because there are 2000+ games to loop through)\n for game in gameName_list:\n df_with_reviews['rev_LC_noNames'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: x.replace(game, \"\"))\n\n return df_with_reviews\n\n\ndef cleaning2_tokenize_words(df_with_reviews=df_allgames):\n '''\n Parse reviews into words. (Need to import nltk first.)\n Output: Adds a column (rev_cleaned) to df_with_reviews containing \n tokenized reviews.\n '''", "original_comment": " # Tokenize words in reviews:\n", "target_code": " import nltk\n\n df_with_reviews['rev_cleaned'] = df_with_reviews['rev_LC_noNames'].apply(\n lambda x: nltk.word_tokenize(x))\n", "project_metadata": {"full_name": "MeredithLevsen/InsightProject", "description": "GameOn - Quickly evaluate board games based on user reviews", "topics": [], "git_url": "git://github.com/MeredithLevsen/InsightProject.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2018-07-17T17:31:15Z", "size": 541, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1045270, "HTML": 265408}, "last_updated": "2018-12-04T03:47:10Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "cleaning1_lowercase_and_remove_bg_names(df_with_reviews=df_allgames)\ncleaning2_tokenize_words(df_with_reviews=df_allgames)\n", "model": "no-comments", "intent": " # Tokenize words in reviews:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. 
More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
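(Equivalently, a single call such as `np.transpose(images, (0, 3, 1, 2))` would reorder `(N, 32, 32, 3)` into `(N, 3, 32, 32)` in one step; this notebook uses two `np.swapaxes` calls below instead.)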
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
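As a small worked example: if 52 of the 64 images in a batch get the correct top-1 class, that batch contributes 52 correct labels, and the accuracy for a whole epoch is 100 * (total correct labels) / (total number of examples).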
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
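Before filling in its [TODO] lines, it may help to keep in mind the usual PyTorch pattern for a single batch step (a minimal sketch only, assuming the variable names that already appear in the function body):\n#\n# optimizer.zero_grad() # reset all gradients in the model\n# net_output = cnn(batch_images) # forward pass on the batch\n# loss = loss_function(net_output, batch_classes) # loss vs. the one-hot targets\n# loss.backward() # compute the gradients\n# optimizer.step() # update the weights\n#\n# 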
The training will use mini-batches with the size defined above.\n\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. 
* \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: {:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. 
We'll also do **the same** transformations as on train/validation dataset (very important).\n\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. 
We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n#%%\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n#%%\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n#%%\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n#%%\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n#%%\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n#%%\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n#%%\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n#%%\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n#%%\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n#%%\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n#%%\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n#%%\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n#%%\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n#%%\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n#%%\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
The training will use mini-batches with the size defined above.\n\n#%%\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... 
it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n#%%\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n#%%\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n#%%\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: 
{:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n#%%\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n#%%\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n#%%\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. We'll also do **the same** transformations as on train/validation dataset (very important).\n\n#%%\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n#%%\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. 
It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n#%%\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])", "original_comment": "# Annotate plots\n", "target_code": "subplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\n", "project_metadata": {"full_name": "jpowie01/CIFAR10-HandsOn", "description": "Hands-on prepared for one of my presentations that took place on Computer Vision's mini-course at student's orgranization called \"Gradient\" (Gda\u0144sk University of Technology)", "topics": ["deep-learning", "convolutional-neural-networks", "cifar10", "jupyter-notebook", "hands-on"], "git_url": "git://github.com/jpowie01/CIFAR10-HandsOn.git", "stars": 6, "watchers": 6, "forks": 0, "created": "2018-01-03T21:22:35Z", "size": 9589, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1717141}, "last_updated": "2018-01-09T19:26:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "subplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n", "model": "natural", "intent": "# Annotate plots"}, {"context": "#!/usr/bin/env 
python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

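# Illustrative sketch only: once the encoders in steps 4-6 above are designed, they could be
# chained with an sklearn Pipeline, since the category_encoders transformers used later in
# this notebook are sklearn-compatible. The column lists and mappings below are placeholders,
# not the final choices made further down.

import category_encoders as ce
import sklearn.pipeline as pipeline

example_encoding_pipeline = pipeline.Pipeline(steps=[
    # Step 4: target encode high-cardinality nominal columns
    ('target_enc', ce.TargetEncoder(cols=['Neighborhood'],
                                    min_samples_leaf=5, smoothing=0.1)),
    # Step 5: ordinal encode quality-style columns with an explicit mapping
    ('ordinal_enc', ce.OrdinalEncoder(cols=['ExterQual'],
                                      mapping=[{'col': 'ExterQual',
                                                'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3,
                                                            'Fa': 4, 'Po': 5}}])),
    # Step 6: one hot encode the remaining low-cardinality nominal columns
    ('onehot_enc', ce.OneHotEncoder(cols=['Street'], use_cat_names=True)),
])

# Usage sketch: X_encoded = example_encoding_pipeline.fit_transform(X_train, y_train)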
\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n\ndf = df_orig.copy()\n\n\ndf.info()\n\n\ndf.head(5)\n\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n\n# Use function to add in indicators for presence of null values\n\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood 
is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n\nfloat(9.00000).is_integer()\n\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n\n\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n\n# ?np.where\n\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n\ndf.head(5)\n\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n\ndf['MasVnrType'].value_counts()\n\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n\ndf[(df.BsmtQual_missing == True)]\n\n\ndf.BsmtCond.value_counts()\n\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from dataset\n\n\ndf.Electrical.value_counts()\n\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces 
in places with fireplaces missing\n\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n 
validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n\n# ## 2. Create additional bespoke data features\n\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n\ndf['Functional'].value_counts()\n\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n\ndf['PoolQC'].value_counts()\n\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n\ndf['Condition1'].value_counts()\n\n\ndf['Condition2'].value_counts()\n\n\ndf['Condition1']\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, inplace=True)\ndf.drop('Condition2', axis=1, 
inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n\ndf.head(10)\n\n\n# ***\n# ## 4. Set up target encoding parameters\n\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_oe.head(5)\n\n\n# ***\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

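#%%

# Illustrative sketch of the step 1 pattern used for LotFrontage further down: learn the
# imputation statistic on the training split only, then apply it to the test/validation
# splits, falling back to the overall training mean for neighbourhoods unseen in training.
# train_df / test_df below are assumed placeholder pandas splits of the Iowa data.


def fit_group_means(train_df, group_col, value_col):
    # Per-group mean learned from the training split only
    return train_df.groupby(group_col)[value_col].mean()


def apply_group_means(df, group_means, group_col, value_col, fallback):
    # Fill NaNs with the learned group mean; unseen groups fall back to the training mean
    filled = df[value_col].fillna(df[group_col].map(group_means))
    return filled.fillna(fallback)

# Usage sketch:
# means = fit_group_means(train_df, 'Neighborhood', 'LotFrontage')
# test_df['LotFrontage'] = apply_group_means(test_df, means, 'Neighborhood', 'LotFrontage',
#                                            train_df['LotFrontage'].mean())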
\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n#%%\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n#%%\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n#%%\n\ndf = df_orig.copy()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(5)\n\n#%%\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n#%%\n\n# Use function to add in indicators for presence of null values\n\n#%%\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n#%%\n\ndf = denote_null_values(df)\n\n#%%\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n#%%\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n#%%\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n#%%\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n#%%\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n#%%\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', 
axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n#%%\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n#%%\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n#%%\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n#%%\n\nfloat(9.00000).is_integer()\n\n#%%\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n#%%\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n\n\n#%%\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n#%%\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n#%%\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n#%%\n\n# ?np.where\n\n#%%\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n#%%\n\ndf.head(5)\n\n#%%\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n#%%\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n#%%\n\ndf['MasVnrType'].value_counts()\n\n#%%\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n#%%\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n#%%\n\ndf[(df.BsmtQual_missing == True)]\n\n#%%\n\ndf.BsmtCond.value_counts()\n\n#%%\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from 
dataset\n\n#%%\n\ndf.Electrical.value_counts()\n\n#%%\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n#%%\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces in places with fireplaces missing\n\n#%%\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n#%%\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n#%%\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n#%%\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n#%%\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n#%%\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n 
validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n#%%\n\n# ## 2. Create additional bespoke data features\n\n#%%\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n#%%\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n#%%\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n#%%\n\ndf['Functional'].value_counts()\n\n#%%\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n#%%\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n#%%\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n#%%\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n#%%\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n#%%\n\ndf['PoolQC'].value_counts()\n\n#%%\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n#%%\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n#%%\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n#%%\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n#%%\n\ndf['Condition1'].value_counts()\n\n#%%\n\ndf['Condition2'].value_counts()\n\n#%%\n\ndf['Condition1']\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n#%%\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n#%%\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n#%%\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n#%%\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n#%%\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, 
inplace=True)\ndf.drop('Condition2', axis=1, inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n#%%\n\ndf.head(10)\n\n#%%\n\n# ***\n# ## 4. Set up target encoding parameters\n\n#%%\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n#%%\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n#%%\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n#%%\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_oe.head(5)\n\n\n# ***", "original_comment": "# ## 5. 
Set up OneHot encoding parameters\n", "target_code": "onehot_enc = ce.OneHotEncoder(verbose=1, cols=[\n 'Street', 'Alley', 'CentralAir', 'MiscFeature'], use_cat_names=True)\nonehot_enc.get_params()\n", "project_metadata": {"full_name": "JonathanBechtel/DAT-10-19", "description": "GitHub Repo For DAT 10-19", "topics": [], "git_url": "git://github.com/JonathanBechtel/DAT-10-19.git", "stars": 2, "watchers": 2, "forks": 11, "created": "2020-10-19T14:53:15Z", "size": 108252, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 72671490, "HTML": 915086, "Python": 92446, "Shell": 222}, "last_updated": "2021-01-06T23:37:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "ordinal_enc = ce.OneHotEncoder(cols=ordenc_cols).fit(df)\nordinal_enc.get_params()\ndf_oe = ordinal_enc.transform(df.drop('SalePrice', axis=1))\ndf_oe.head(5)\n", "model": "docstring", "intent": "# 5. Set up OneHot encoding parameters"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. 
Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n\ndata['sex'].unique()\n\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Weight of evidence\n#\n# Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default. That is, to predict how likely the money lent to a person or institution is to be lost. 
Thus, Weight of Evidence is a measure of the \"strength\u201d of a grouping technique to separate good and bad risk (default).\n#\n# - WoE will be 0 if the P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.\n# - If P(Bads) > P(Goods) the odds ratio will be < 1 and,\n# - WoE will be < 0 if, P(Goods) > P(Bads).\n#\n# WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.\n#\n# The WoE transformation has three advantages:\n#\n# - It creates a monotonic relationship between the target and the independent variables.\n# - It orders the categories on a \"logistic\" scale which is natural for logistic regression\n# - The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.\n#\n# The WoE also has a limitation:\n#\n# - Prone to cause over-fitting\n#\n#\n# For more details follow this [link](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview)\n#\n#\n# ## In this demo:\n#\n# We will see how to perform one hot encoding with:\n# - pandas\n# - Feature-Engine\n#\n# And the advantages and limitations of each implementation using the Titanic dataset.\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nimport matplotlib.pyplot as plt\n\n# to split the datasets\nfrom sklearn.model_selection import train_test_split\n\n# for encoding using feature-engine\nfrom feature_engine.categorical_encoders import WoERatioCategoricalEncoder\n\n#%%\n\n# load dataset\n\ndata = pd.read_csv(\n '../titanic.csv',\n usecols=['cabin', 'sex', 'embarked', 'survived'])\n\ndata.head()\n\n#%%\n\n# let's remove observations with na in embarked\n\ndata.dropna(subset=['embarked'], inplace=True)\ndata.shape\n\n#%%\n\n# Now we extract the first letter of the cabin\n# to create a simpler variable for the demo\n\ndata['cabin'] = data['cabin'].astype(str).str[0]\n\n#%%\n\n# and we remove the observations where cabin = T\n# because only few observations show T in the dataset\n\ndata = data[data['cabin'] != 'T']\ndata.shape\n\n#%%\n\n# let's have a look at how many labels each variable has\n\nfor col in data.columns:\n print(col, ': ', len(data[col].unique()), ' labels')\n\n#%%\n\n# let's explore the unique categories\ndata['cabin'].unique()\n\n#%%\n\ndata['sex'].unique()\n\n#%%\n\ndata['embarked'].unique()\n\n\n# ### Encoding important\n#\n# We select calculate the woe using the train set, and then use those mappings in the test set.\n#\n# Note that in the pandas implementation, we need to keep the target in the training set\n\n#%%\n\n# let's separate into training and testing set\n\nX_train, X_test, y_train, y_test = train_test_split(\n # this time we keep the target!!\n data[['cabin', 'sex', 'embarked', 'survived']],\n data['survived'], # target\n test_size=0.3, # percentage of obs in test set\n random_state=0) # seed to ensure reproducibility\n\nX_train.shape, X_test.shape\n\n\n# ### Explore original relationship between categorical variables and target\n\n#%%\n\n# let's explore the relationship of the categories with the target\n\nfor var in ['cabin', 'sex', 'embarked']:\n\n fig = plt.figure()\n fig = X_train.groupby([var])['survived'].mean().plot()\n 
fig.set_title('Relationship between {} and Survival'.format(var))\n fig.set_ylabel('Mean Survival')\n plt.show()\n\n\n# You can see that the relationship between the target and cabin and embarked goes up and down, depending on the category.\n#\n#\n# ## Weight of Evidence with pandas\n#\n#\n# ### Advantages\n#\n# - quick\n# - returns pandas dataframe\n#\n# ### Limitations of pandas:\n#\n# - it does not preserve information from train data to propagate to test data\n#\n# We need to store the mappings separately if planning to use the model in production.\n\n#%%\n\n# let's calculate the probability of survived = 1 per category\n# (probability of events or p(1))\n\nprob_df = X_train.groupby(['cabin'])['survived'].mean()", "original_comment": "# and capture it into a dataframe\n", "target_code": "prob_df = pd.DataFrame(prob_df)\n", "project_metadata": {"full_name": "mohsin-ashraf/personal-msds", "description": "Repository for personal MSDS", "topics": [], "git_url": "git://github.com/mohsin-ashraf/personal-msds.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-03-26T06:57:19Z", "size": 20354, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 21670112, "Python": 33451}, "last_updated": "2020-09-18T15:36:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "prob_df = X_train.groupby(['cabin'])['survived'].mean()\nprob_df = pd.DataFrame(prob_df)\nprob_df = prob_df.reset_index()\nprob_df = prob_df.rename(columns={0: 'prob'})\nprob_df.head()\n", "model": "docstring", "intent": "# capture it into a dataframe"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n\nleads_scoring.head(5)\n\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n\nleads_scoring = 
leads_scoring.replace('Select', np.nan)\n\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n\nround(\n 100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. 
The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. Hence replacing the null values with 'Specialization Not given'\n\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 
'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n\nround(leads_scoring.describe(), 2)\n\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n\nleads_scoring_model.info()\n\n\nleads_scoring_model.shape\n\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n\n\n\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n\n\n\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n\nX_train.describe()\n\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n\n\n\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n\nlogreg = LogisticRegression()\n\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n\n\n\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n\ncols = X_train.columns[rfe.support_]\n\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n\ncols\n\n\nX_train.shape\n\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data set\n\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a 
dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n\n\n\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n#%%\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n#%%\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n#%%\n\nleads_scoring.head(5)\n\n#%%\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n#%%\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n#%%\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n#%%\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n#%%\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n#%%\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n#%%\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n#%%\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n#%%\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n#%%\n\nround(\n 100*(leads_scoring['Lead 
Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n#%%\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n#%%\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n#%%\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n#%%\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n#%%\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n#%%\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n#%%\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n#%%\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n#%%\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n#%%\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n#%%\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n#%%\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n#%%\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n#%%\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n#%%\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n#%%\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n#%%\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n#%%\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n#%%\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 
100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n#%%\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n#%%\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n#%%\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n#%%\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n#%%\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n#%%\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n#%%\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n#%%\n\nleads_scoring_model.info()\n\n#%%\n\nleads_scoring_model.shape\n\n#%%\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n#%%\n\n\n\n#%%\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n#%%\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n#%%\n\n\n\n#%%\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n#%%\n\nX_train.describe()\n\n#%%\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n#%%\n\n\n\n#%%\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n#%%\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n#%%\n\nlogreg = LogisticRegression()\n\n#%%\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n#%%\n\n\n\n#%%\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n#%%\n\ncols = X_train.columns[rfe.support_]\n\n#%%\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n#%%\n\ncols\n\n#%%\n\nX_train.shape\n\n#%%\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n#%%\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data 
set\n\n#%%\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n#%%\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n#%%\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n#%%\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n#%%\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n#%%\n\n\n\n#%%\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n#%%", "original_comment": "# Let's check the overall accuracy.\n", "target_code": "print(metrics.accuracy_score(y_train_pred_final.Converted,\n y_train_pred_final.Predicted_Conversion))\n", "project_metadata": {"full_name": "saad1504/Upgrad_DataScience_Projects", "description": "All Data Science projects completed for PGPDS by Upgrad", "topics": [], "git_url": "git://github.com/saad1504/Upgrad_DataScience_Projects.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2019-10-14T16:57:22Z", "size": 29931, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6008971, "PLSQL": 11605}, "last_updated": "2020-10-12T22:18:23Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1}, {"completed_by": {"id": 2}, "compatibility": "Strongly agree", "compatibility-score": 3, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0}], "predicted_code": "y_train_pred_final.head()\n", "model": "no-comments", "intent": "# Let's check the overall accuracy."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# 
#### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null 
values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed Datasets \n\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + 
y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. \"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. 
\"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n#%%\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n#%%\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n#%%\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n#%%\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n#%%\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next 
step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n#%%\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n#%%\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n#%%\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n#%%\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n#%%\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n#%%\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed 
Datasets \n\n#%%\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n#%%\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n#%%\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n#%%\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n#%%\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. 
\"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n#%%\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. \"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#", "original_comment": "# Function to scale input data (`X`) for predictive purposes.\n", "target_code": "from sklearn.preprocessing import StandardScaler\n\ndef dataset_scaler(training_data, testing_data, obj=StandardScaler):\n \"\"\" Function to scale X-data using custom input algorithm. \"\"\"\n SCALED_FEATURES = [feature + \"_sca\" for feature in training_data]\n scaler = obj()\n scaler.fit(training_data)\n X_train_sca = pd.DataFrame(scaler.transform(\n training_data).T, SCALED_FEATURES).T\n X_test_sca = pd.DataFrame(scaler.transform(\n testing_data).T, SCALED_FEATURES).T\n return X_train_sca, X_test_sca\n", "project_metadata": {"full_name": "AakashSudhakar/6nomads-interview-project", "description": "Interview project repository for data analysis and prediction for 6Nomads data. ", "topics": ["data-analysis", "data-processing", "data-science", "machine-learning", "data-structures"], "git_url": "git://github.com/AakashSudhakar/6nomads-interview-project.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2019-09-06T05:04:40Z", "size": 385, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 545554, "Python": 21164}, "last_updated": "2020-05-13T23:33:12Z"}, "annotations": [{"completed_by": {"id": 1}, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "scaler = preprocessing.StandardScaler().fit(X_train)\nX_train_scaled = scaler.transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n", "model": "docstring", "intent": "# Function to scale input data (`X`) for predictive purposes."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". 
LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
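# Note on the download carried out in the next cells: the `Yahoo` endpoint wrapped by
# `pandas_datareader` has been unreliable across library versions, so the `dr.data.DataReader(...)`
# call below may fail in some environments. The sketch here shows one commonly used fallback; the
# helper name is illustrative only and it assumes the optional `yfinance` package, which is not
# otherwise required by this lab:

def download_ibm_daily(start, end):
    """Try the pandas_datareader Yahoo interface first; fall back to yfinance if it fails."""
    try:
        return dr.data.DataReader('IBM', data_source='yahoo', start=start, end=end)
    except Exception:
        import yfinance as yf  # optional dependency, assumed to be installed for the fallback
        return yf.download('IBM', start=start, end=end)

# usage (equivalent to the download cell below): stock_data = download_ibm_daily(start_date, end_date)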
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
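# As an aside, the same rolling-window matrix can also be assembled in one step; the explicit
# loop used in this notebook follows further below. The sketch assumes `train_stock_data` and
# `sequence_length` as defined above and, apart from minor differences in the exact start/end
# indices, produces the same kind of (number of sequences, sequence_length) array:

returns = train_stock_data['RETURN'].values

# one row per rolling window of `sequence_length` consecutive daily returns
train_sequences_sketch = np.array(
    [returns[i:i + sequence_length] for i in range(1, len(returns) - sequence_length)])

print(train_sequences_sketch.shape)  # (number of sequences, sequence_length)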
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. 
Definition of the Training Loss Function and Learning Rate\n\n# We are now ready to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimizes the prediction error between the true return $r_{t+1}$ and the return $\\hat{r}_{t+1}$ predicted by the model at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable of predicting the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
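# To make the notation concrete: with the `LSTMNet` defined above, the learned function f_theta
# is a single forward pass over the last `time_steps` returns, and its final output is the
# next-step estimate. This is an illustrative sketch using a zero placeholder input; `lstm_model`,
# `time_steps` and `device` are the objects defined earlier in this notebook:

last_returns = torch.zeros(1, time_steps)  # placeholder for (r_t, r_t-1, ..., r_t-n)
with torch.no_grad():
    next_return_hat = lstm_model(last_returns.to(device))[:, -1]  # estimate of r_t+1
print(next_return_hat.shape)  # torch.Size([1])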
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^{*} = \\arg\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this objective, one typically minimizes a loss function $\\mathcal{L}_{\\theta}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,
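# A quick numeric sanity check that the formula above is exactly what PyTorch's `nn.MSELoss`
# (instantiated as `loss_function` in the next cell) computes, i.e. the mean of the squared
# differences between predicted and true returns (a sketch with made-up numbers):

r_true = torch.tensor([0.010, -0.020, 0.005])
r_hat = torch.tensor([0.012, -0.018, 0.004])

manual_mse = ((r_true - r_hat) ** 2).mean()
torch_mse = nn.MSELoss()(r_hat, r_true)
print(manual_mse.item(), torch_mse.item())  # both values are identical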
\n\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
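# Before walking through the full loop, the sketch below isolates the canonical update performed
# on one mini-batch. Note that it resets the accumulated gradients with `optimizer.zero_grad()`
# before the backward pass; the training loop further below omits this call, so without it the
# gradients of successive mini-batches would be accumulated rather than recomputed:

def train_step(model, batch_inputs, batch_targets, criterion, opt):
    """Run a single gradient update on one mini-batch and return its loss value."""
    opt.zero_grad()                          # reset gradients from the previous batch
    batch_predictions = model(batch_inputs)  # forward pass through the LSTM
    batch_loss = criterion(batch_predictions, batch_targets)
    batch_loss.backward()                    # backward pass (gradient computation)
    opt.step()                               # parameter update
    return batch_loss.item()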
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n#%%\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n#%%\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n#%%\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n#%%\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n#%%\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n#%%\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n#%%\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n#%%\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n#%%\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n#%%\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n#%%\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n#%%\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n#%%\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n#%%\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n#%%\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n#%%\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n#%%\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n#%%\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n#%%\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n#%%\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n#%%\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n#%%\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n#%%\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n#%%\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n#%%\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n#%%\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n#%%\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n#%%\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! 
Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n#%%\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n    # collect number of parameters\n    num_params += param.numel()\n\n# print the number of model parameters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive **32'284 model parameters** to be trained.\n\n# #### 3.2. Definition of the Training Loss Function and Learning Rate\n\n# We are now ready to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimizes the prediction error between the true return $r_{t+1}$ and the return $\\hat{r}_{t+1}$ predicted by the model at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable of predicting the return of the next time-step as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
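\n\n# To make this mapping concrete, the (purely illustrative) sketch below shows how such a learned model would be queried: it feeds a window of the four most recent returns through the `lstm_model` defined above and reads off the last output as the next-step prediction $\\hat{r}_{t+1}$. The return values in `example_window` are made-up placeholders, not data from the lab dataset.\n\n#%%\n\n# illustrative only: query the model with a window of four past returns\nwith torch.no_grad():\n\n    # made-up example window of daily returns (shape: 1 sequence x 4 time-steps)\n    example_window = torch.tensor([[0.004, -0.002, 0.001, 0.003]]).float().to(device)\n\n    # the model returns one prediction per input time-step ...\n    example_predictions = lstm_model(example_window)\n\n    # ... and the last entry corresponds to the predicted next-step return\n    next_return_prediction = example_predictions[:, -1]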
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that solve $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L}_{\\theta}$ while training the neural network. In this lab, we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r^{i}_{t+1} - \\hat{r}^{i}_{t+1}\\|^{2}$, where $N$ denotes the number of return predictions the loss is averaged over.
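\n\n# As a quick, optional sanity check (not part of the original lab flow), the small cell below verifies on made-up toy values that PyTorch's `nn.MSELoss` computes exactly this mean of squared differences:\n\n#%%\n\n# illustrative only: compare nn.MSELoss against the formula above on toy values\ntoy_targets = torch.tensor([0.002, -0.001, 0.003])\ntoy_predictions = torch.tensor([0.001, 0.000, 0.002])\n\n# mean of squared differences, computed manually\nmanual_mse = ((toy_targets - toy_predictions) ** 2).mean()\n\n# the same quantity via the PyTorch loss module\ntorch_mse = nn.MSELoss()(toy_predictions, toy_targets)\n\nprint('[LOG] manual MSE: {:.8f} / nn.MSELoss: {:.8f}'.format(manual_mse.item(), torch_mse.item()))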
\n\n#%%\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation\" (Adam)** optimization technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 10^{-6}$. For each training step, the optimizer will update the model parameter values $\\theta$ according to the degree of prediction error (the MSE loss).\n\n#%%\n\nlearning_rate = 1e-06  # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks, let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **200 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **200 times** in chunks of 128 sequences, yielding **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n#%%\n\n# specify the training parameters\nnum_epochs = 200  # number of training epochs\nmini_batch_size = 128  # size of the mini-batches\n\n\n# Furthermore, let's specify and instantiate a corresponding PyTorch data loader that feeds the sequence tensors to our neural network:\n\n#%%\n\ndl = dataloader.DataLoader(\n    train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r^{i}_{t+1} - \\hat{r}^{i}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model, we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can assess the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `optimizer.zero_grad()` resets the gradients accumulated in the previous mini-batch,\n# >- `loss.backward()` computes the gradients based on the magnitude of the prediction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n#%%\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n    # init collection of mini-batch losses\n    train_mini_batch_losses = []\n\n    # iterate over mini-batches\n    for sequence_batch, target_batch in dl:\n\n        # push mini-batch data to computation device\n        sequence_batch = sequence_batch.to(device)\n        target_batch = target_batch.to(device)\n\n        # reset the gradients accumulated in the previous mini-batch\n        optimizer.zero_grad()\n\n        # predict sequence output\n        prediction_batch = lstm_model(sequence_batch)\n\n        # calculate batch loss\n        batch_loss = loss_function(prediction_batch, target_batch)\n\n        # run backward gradient calculation\n        batch_loss.backward()\n\n        # update network parameters\n        optimizer.step()\n\n        # collect mini-batch loss\n        train_mini_batch_losses.append(batch_loss.data.item())\n\n    # determine mean mini-batch loss of epoch\n    train_epoch_loss = np.mean(train_mini_batch_losses)\n\n    # print epoch loss\n    now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n    print('[LOG {}] epoch: {} train-loss: {}'.format(str(now), str(epoch), str(train_epoch_loss)))\n\n    # collect mean mini-batch loss of epoch\n    train_epoch_losses.append(train_epoch_loss)\n\n    # print epoch and save models\n    if epoch % 10 == 0 and epoch > 0:\n\n        # case: new best model trained\n        if train_epoch_loss < best_loss:\n\n            # store new best model\n            model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n            torch.save(lstm_model.state_dict(), os.path.join(\"./models\", model_name))\n\n            # update best loss\n            best_loss = train_epoch_loss\n\n            # print epoch loss\n            now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n            print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now), str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training, let's visualize and inspect the training loss per epoch:\n\n#%%\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n        train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network for a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n#%%\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n#%%\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)", "original_comment": "# set axis labels\n", "target_code": "plt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.axis('off')\nfig = plt.figure()\nax = fig.add_subplot(111)\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].\n", "model": "docstring", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. 
The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. 
Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. 
Definition of the Training Loss Function and Learning Rate\n\n# We are now ready to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimizes the prediction error between the true return $r_{t+1}$ and the return $\\hat{r}_{t+1}$ predicted by the model at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable of predicting the return of the next time-step as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that solve $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L}_{\\theta}$ while training the neural network. In this lab, we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r^{i}_{t+1} - \\hat{r}^{i}_{t+1}\\|^{2}$, where $N$ denotes the number of return predictions the loss is averaged over.
\n\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n#%%\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n#%%\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n#%%\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n#%%\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n#%%\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n#%%\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n#%%\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n#%%\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n#%%\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n#%%\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n#%%\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n#%%\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n#%%\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n#%%\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n#%%\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n#%%\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n#%%\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n#%%\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n#%%\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n#%%\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n#%%\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n#%%\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$, the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction, the LSTM uses its learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The individual time-step losses are then accumulated into the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Separate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the target return to be predicted, denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Separate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the target return to be predicted, denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class so that our dataset supplies the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n#%%\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequence and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class, we instantiate it using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n#%%\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42nd sequence and its corresponding targets:\n\n#%%\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the time series model to be learned. Furthermore, we will specify the loss function, learning rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g., as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'**, consists of three layers in total. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consists of an LSTM cell exhibiting a hidden state of 51 dimensions. The third, linear layer squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n#%%\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now that we have implemented our first LSTM neural network, we are ready to instantiate a model to be trained:\n\n#%%\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by executing the following cell:\n\n#%%\n\n# print the initialized architecture\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks as intended? Great! 
Finally, let's have a look at the number of model parameters that we aim to train in the next steps of the notebook:\n\n#%%\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model parameters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive **32'284 model parameters** to be trained.\n\n# #### 3.2. Definition of the Training Loss Function and Learning Rate\n\n# We are now almost ready to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimize the prediction error between the true return $r_{t+1}$ and the return $\\hat{r}_{t+1}$ predicted by the model at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable of predicting the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that solve $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L}_{\\theta}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, denoted by:\n\n#
$\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r^{i}_{t+1} - \\hat{r}^{i}_{t+1}\\|^{2}$,
\n\n#%%\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the values of the model parameters $\\theta$ according to the degree of prediction error (the MSE loss).\n\n#%%\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks, let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. To this end, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **200 epochs** in mini-batches of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **200 times** in chunks of 128 sequences, yielding **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n#%%\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, let's specify and instantiate a corresponding PyTorch data loader that feeds the sequence tensors to our neural network:\n\n#%%\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r^{i}_{t+1} - \\hat{r}^{i}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model, we will monitor whether the loss decreases with progressing training. To this end, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n#%%\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n#%%\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n#%%\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n#%%\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)", "original_comment": "# set axis labels\n", "target_code": "plt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\nvalid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\nvalid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\nax.plot(valid_stock_sequence_data_date\n", "model": "no-comments", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker 
import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. 
In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with scikit-learn:\n\n\nX, y = 
load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. 
In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n#%%\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n#%%\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n#%%\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n#%%\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n#%%\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n#%%\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n#%%\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n#%%\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n#%%\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n#%%\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n#%%\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with 
scikit-learn:\n\n#%%\n\nX, y = load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')", "original_comment": "# making predictions on the testing set\n", "target_code": "y_pred = lr.predict(X_test)\n", "project_metadata": {"full_name": "asabenhur/CS345", "description": "Jupyter", "topics": [], "git_url": "git://github.com/asabenhur/CS345.git", "stars": 4, "watchers": 4, "forks": 11, "created": "2020-08-11T19:32:02Z", "size": 6413, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4808835}, "last_updated": "2020-12-30T20:50:00Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Strongly agree", "coverage-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "y_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n", "model": "natural", "intent": "# making predictions on the testing set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n#%%\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------", "original_comment": "# Create figure and dimensions\n", "target_code": "plt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\n", "project_metadata": {"full_name": "isaacarroyov/ss_plots", "description": "Repositorio de gr\u00e1ficas realizadas en Python para mis boletines de servicio social (Ecuaciones Diferenciales y An\u00e1lisis Vectorial) || Repository of the plots made in Python for my social service bulletins (Differential Equations and Vector Calculus)", "topics": ["differential-equations", "math", "vector-analysis", "university", "python3", "python", "ecuaciones-diferenciales"], "git_url": "git://github.com/isaacarroyov/ss_plots.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-27T19:15:30Z", "size": 21849, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 29758848}, "last_updated": "2020-11-24T18:53:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig = plt.figure(figsize=(10, 10))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(X_curve, Y_curve, U_dcurve, V_dcurve)\nax.set_xlabel(r'$x$', size=20)\nax.set_ylabel(r'$y$', size=20)\nax.set_zlabel(r'$u$', size=20)\nax.tick_params(labelsize=15)\nplt.tight_layout()\nplt.show()\n", "model": "docstring", "intent": "# Create figure and dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. 
The data is loaded below:\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n\ninput_shape\n\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. 
Use 50 epochs, 25 training steps and 15 validation steps\n\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n#%%\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n#%%\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). 
Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n#%%\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n#%%\n\ninput_shape\n\n#%%\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n#%%\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n#%%\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. Use 50 epochs, 25 training steps and 15 validation steps\n\n#%%\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. 
Keep everything else the same.\n\n#%%\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. How did the results improve?\n\n#%%\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n#%%\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))", "original_comment": "# Second Convolutional layer.\n", "target_code": "new_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "new_model = Sequential()\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "model": "docstring", "intent": "# Second Convolutional layer."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, 
Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = 
np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, 
activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. 
\\\\\n# arch zero:\n\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n\nout\n\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n\ntrain, validation = get_MNIST_data()\n\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), 
activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom matplotlib import pyplot as plt\nfrom keras.initializers import VarianceScaling\nfrom keras import backend as K\nfrom keras.datasets import mnist\nfrom keras.callbacks import Callback\nfrom keras.utils import np_utils\nfrom keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D\nfrom keras.optimizers import SGD, Adam\nfrom keras.models import Sequential\nimport pdb\nimport numpy as np\nimport itertools\n\nnp.random.seed(0)\n\n# Read the simple 2D dataset files\n\n\ndef get_data_set(name):\n try:\n data = np.loadtxt(name, skiprows=0, delimiter=' ')\n except:\n return None, None, None\n np.random.shuffle(data) # shuffle the data\n # The data uses ROW vectors for a data point, that's what Keras assumes.\n _, d = data.shape\n X = data[:, 0:d-1]\n Y = data[:, d-1:d]\n y = Y.T[0]\n classes = set(y)\n if classes == set([-1.0, 1.0]):\n print('Convert from -1,1 to 0,1')\n y = 0.5*(y+1)\n print('Loading X', X.shape, 'y', y.shape, 'classes', set(y))\n return X, y, len(classes)\n\n\nclass LossHistory(Callback):\n def on_train_begin(self, logs={}):\n self.keys = ['loss', 'acc', 'val_loss', 'val_acc']\n self.values = {}\n for k in self.keys:\n self.values['batch_'+k] = []\n self.values['epoch_'+k] = []\n\n def on_batch_end(self, batch, logs={}):\n for k in self.keys:\n bk = 'batch_'+k\n if k in logs:\n self.values[bk].append(logs[k])\n\n def on_epoch_end(self, epoch, logs={}):\n for k in self.keys:\n ek = 'epoch_'+k\n if k in logs:\n self.values[ek].append(logs[k])\n\n def plot(self, keys):\n for key in keys:\n plt.plot(np.arange(len(self.values[key])), np.array(\n self.values[key]), label=key)\n plt.legend()\n\n\ndef run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs, split=0, verbose=True):\n # Model specification\n model = Sequential()\n for layer in layers:\n model.add(layer)\n # Define the optimization\n model.compile(loss='categorical_crossentropy',\n optimizer=Adam(), metrics=[\"accuracy\"])\n N = X_train.shape[0]\n # Pick batch size\n batch = 32 if N > 1000 else 1 # batch size\n history = LossHistory()\n # Fit the model\n if X_val is None:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_split=split,\n callbacks=[history], verbose=verbose)\n else:\n model.fit(X_train, y_train, epochs=epochs, batch_size=batch, validation_data=(X_val, y_val),\n callbacks=[history], verbose=verbose)\n # Evaluate the model on validation data, if any\n if X_val is not None or split > 0:\n val_acc, val_loss = history.values['epoch_val_acc'][-1], history.values['epoch_val_loss'][-1]\n print(\"\\nLoss on validation set:\" + str(val_loss) +\n \" Accuracy on validation set: \" + str(val_acc))\n else:\n val_acc = None\n # Evaluate the model on test data, if any\n if X_test is not None:\n 
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch)\n print(\"\\nLoss on test set:\" + str(test_loss) +\n \" Accuracy on test set: \" + str(test_acc))\n else:\n test_acc = None\n return model, history, val_acc, test_acc\n\n\ndef dataset_paths(data_name):\n return [\"data/data\"+data_name+\"_\"+suffix+\".csv\" for suffix in (\"train\", \"validate\", \"test\")]\n\n# The name is a string such as \"1\" or \"Xor\"\n\n\ndef run_keras_2d(data_name, layers, epochs, display=True, split=0.25, verbose=True, trials=1):\n print('Keras FC: dataset=', data_name)\n (train_dataset, val_dataset, test_dataset) = dataset_paths(data_name)\n # Load the datasets\n X_train, y, num_classes = get_data_set(train_dataset)\n X_val, y2, _ = get_data_set(val_dataset)\n X_test, y3, _ = get_data_set(test_dataset)\n # Categorize the labels\n y_train = np_utils.to_categorical(y, num_classes) # one-hot\n y_val = y_test = None\n if X_val is not None:\n y_val = np_utils.to_categorical(y2, num_classes) # one-hot\n if X_test is not None:\n y_test = np_utils.to_categorical(y3, num_classes) # one-hot\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc, = run_keras(X_train, y_train, X_val, y_val, X_test, y_test, layers, epochs,\n split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if display:\n # plot classifier landscape on training data\n plot_heat(X_train, y, model)\n plt.title('Training data')\n plt.show()\n if X_test is not None:\n # plot classifier landscape on testing data\n plot_heat(X_test, y3, model)\n plt.title('Testing data')\n plt.show()\n # Plot epoch loss\n history.plot(['epoch_loss', 'epoch_val_loss'])\n plt.xlabel('epoch')\n plt.ylabel('loss')\n plt.title('Epoch val_loss and loss')\n plt.show()\n # Plot epoch accuracy\n history.plot(['epoch_acc', 'epoch_val_acc'])\n plt.xlabel('epoch')\n plt.ylabel('accuracy')\n plt.title('Epoch val_acc and acc')\n plt.show()\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n return X_train, y, model\n\n\ndef get_MNIST_data(shift=0):\n (X_train, y1), (X_val, y2) = mnist.load_data()\n if shift:\n size = 28+shift\n X_train = shifted(X_train, shift)\n X_val = shifted(X_val, shift)\n return (X_train, y1), (X_val, y2)\n\n\ndef shifted(X, shift):\n n = X.shape[0]\n m = X.shape[1]\n size = m + shift\n X_sh = np.zeros((n, size, size))\n plt.ion()\n for i in range(n):\n sh1 = np.random.randint(shift)\n sh2 = np.random.randint(shift)\n X_sh[i, sh1:sh1+m, sh2:sh2+m] = X[i, :, :]\n # If you want to see the shifts, uncomment\n #plt.figure(1); plt.imshow(X[i])\n #plt.figure(2); plt.imshow(X_sh[i])\n # plt.show()\n # input('Go?')\n return X_sh\n\n\ndef run_keras_fc_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n (X_train, y1), (X_val, y2) = train, test\n # Flatten the images\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m*m))\n X_val = X_val.reshape((X_val.shape[0], m*m))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. test accuracy:\" + str(test_acc/trials))\n\n\ndef run_keras_cnn_mnist(train, test, layers, epochs, split=0.1, verbose=True, trials=1):\n # Load the dataset\n (X_train, y1), (X_val, y2) = train, test\n # Add a final dimension indicating the number of channels (only 1 here)\n m = X_train.shape[1]\n X_train = X_train.reshape((X_train.shape[0], m, m, 1))\n X_val = X_val.reshape((X_val.shape[0], m, m, 1))\n # Categorize the labels\n num_classes = 10\n y_train = np_utils.to_categorical(y1, num_classes)\n y_val = np_utils.to_categorical(y2, num_classes)\n # Train, use split for validation\n val_acc, test_acc = 0, 0\n for trial in range(trials):\n # Reset the weights\n # See https://github.com/keras-team/keras/issues/341\n session = K.get_session()\n for layer in layers:\n for v in layer.__dict__:\n v_arg = getattr(layer, v)\n if hasattr(v_arg, 'initializer'):\n initializer_func = getattr(v_arg, 'initializer')\n initializer_func.run(session=session)\n # Run the model\n model, history, vacc, tacc = run_keras(\n X_train, y_train, X_val, y_val, None, None, layers, epochs, split=split, verbose=verbose)\n val_acc += vacc if vacc else 0\n test_acc += tacc if tacc else 0\n if val_acc:\n print(\"\\nAvg. validation accuracy:\" + str(val_acc/trials))\n if test_acc:\n print(\"\\nAvg. 
test accuracy:\" + str(test_acc/trials))\n\n# Plotting functions\n\n\ndef plot_heat(X, y, model, res=200):\n eps = .1\n xmin = np.min(X[:, 0]) - eps\n xmax = np.max(X[:, 0]) + eps\n ymin = np.min(X[:, 1]) - eps\n ymax = np.max(X[:, 1]) + eps\n ax = tidyPlot(xmin, xmax, ymin, ymax, xlabel='x', ylabel='y')\n xl = np.linspace(xmin, xmax, res)\n yl = np.linspace(ymin, ymax, res)\n xx, yy = np.meshgrid(xl, yl, sparse=False)\n zz = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()]), axis=1)\n im = ax.imshow(np.flipud(zz.reshape((res, res))), interpolation='none',\n extent=[xmin, xmax, ymin, ymax],\n cmap='viridis')\n plt.colorbar(im)\n for yi in set([int(_y) for _y in set(y)]):\n color = ['r', 'g', 'b'][yi]\n marker = ['X', 'o', 'v'][yi]\n cl = np.where(y == yi)\n ax.scatter(X[cl, 0], X[cl, 1], c=color, marker=marker, s=80,\n edgecolors='none')\n return ax\n\n\ndef tidyPlot(xmin, xmax, ymin, ymax, center=False, title=None,\n xlabel=None, ylabel=None):\n plt.figure(facecolor=\"white\")\n ax = plt.subplot()\n if center:\n ax.spines['left'].set_position('zero')\n ax.spines['right'].set_color('none')\n ax.spines['bottom'].set_position('zero')\n ax.spines['top'].set_color('none')\n ax.spines['left'].set_smart_bounds(True)\n ax.spines['bottom'].set_smart_bounds(True)\n ax.xaxis.set_ticks_position('bottom')\n ax.yaxis.set_ticks_position('left')\n else:\n ax.spines[\"top\"].set_visible(False)\n ax.spines[\"right\"].set_visible(False)\n ax.get_xaxis().tick_bottom()\n ax.get_yaxis().tick_left()\n eps = .05\n plt.xlim(xmin-eps, xmax+eps)\n plt.ylim(ymin-eps, ymax+eps)\n if title:\n ax.set_title(title)\n if xlabel:\n ax.set_xlabel(xlabel)\n if ylabel:\n ax.set_ylabel(ylabel)\n return ax\n\n\ndef archs(classes):\n return [[Dense(input_dim=2, units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=10, activation='relu'),\n Dense(units=10, activation='relu'),\n Dense(units=classes, activation=\"softmax\")],\n [Dense(input_dim=2, units=100, activation='relu'),\n Dense(units=100, activation='relu'),\n Dense(units=classes, activation=\"softmax\")]]\n\n\ndef plot_separator(ax, th, th_0):\n xmin, xmax = ax.get_xlim()\n ymin, ymax = ax.get_ylim()\n pts = []\n eps = 1.0e-6\n # xmin boundary crossing is when xmin th[0] + y th[1] + th_0 = 0\n # that is, y = (-th_0 - xmin th[0]) / th[1]\n if abs(th[1, 0]) > eps:\n pts += [np.array([x, (-th_0 - x * th[0, 0]) / th[1, 0]])\n for x in (xmin, xmax)]\n if abs(th[0, 0]) > 1.0e-6:\n pts += [np.array([(-th_0 - y * th[1, 0]) / th[0, 0], y])\n for y in (ymin, ymax)]\n in_pts = []\n for p in pts:\n if (xmin-eps) <= p[0] <= (xmax+eps) and (ymin-eps) <= p[1] <= (ymax+eps):\n duplicate = False\n for p1 in in_pts:\n if np.max(np.abs(p - p1)) < 1.0e-6:\n duplicate = True\n if not duplicate:\n in_pts.append(p)\n if in_pts and len(in_pts) >= 2:\n # Plot separator\n vpts = np.vstack(in_pts)\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Plot normal\n vmid = 0.5*(in_pts[0] + in_pts[1])\n scale = np.sum(th*th)**0.5\n diff = in_pts[0] - in_pts[1]\n dist = max(xmax-xmin, ymax-ymin)\n vnrm = vmid + (dist/10)*(th.T[0]/scale)\n vpts = np.vstack([vmid, vnrm])\n ax.plot(vpts[:, 0], vpts[:, 1], 'k-', lw=2)\n # Try to keep limits from moving around\n ax.set_xlim((xmin, xmax))\n ax.set_ylim((ymin, ymax))\n else:\n print('Separator not in plot range')\n\n\ndef plot_decision(data, cl, 
diff=False):\n layers = archs(cl)[0]\n X, y, model = run_keras_2d(\n data, layers, 10, trials=1, verbose=False, display=False)\n ax = plot_heat(X, y, model)\n W = layers[0].get_weights()[0]\n W0 = layers[0].get_weights()[1].reshape((cl, 1))\n if diff:\n for i, j in list(itertools.combinations(range(cl), 2)):\n plot_separator(ax, W[:, i:i+1] - W[:, j:j+1],\n W0[i:i+1, :] - W0[j:j+1, :])\n else:\n for i in range(cl):\n plot_separator(ax, W[:, i:i+1], W0[i:i+1, :])\n plt.show()\n\n\n# Use 10 epochs of training for each architecture and re-start the training and testing 5 times, look at the average accuracy on the validation set (reported as \"Avg. validation accuracy\" at the end of the run). But, notice the variation in scores across each run. \\\\\n# arch zero:\n\n#%%\n\nlayers = archs(2)[0]\n# print(layers)\nX_train, y, model = run_keras_2d('1', layers, 10)\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n for data_num in range(1, 5):\n print('For data_num:', data_num)\n layers = archs(2)[layer_num]\n X_train, y, model = run_keras_2d(\n str(data_num), layers, 10, verbose=False, trials=5)\n\n\n# Training for data '3' using architecture (200,200) for 100 epochs.\n\n#%%\n\nlayers = archs(2)[4]\nX_train, y, model = run_keras_2d('3', layers, 100)\n\n\n# What is the average validation accuracy (using split=0.5) for the three-class data set (data_name=3class), for each of the architectures, using 10 epochs and 5 trials as before.\n\n#%%\n\nfor layer_num in range(5):\n print('For arch: ', layer_num)\n layers = archs(3)[layer_num]\n X_train, y, model = run_keras_2d(\n '3class', layers, 10, display=False, split=0.5, verbose=False, trials=5)\n\n#%%\n\n# Using no arch 0 on 3class data\nlayers = archs(3)[0]\nX_train, y, model = run_keras_2d(\n '3class', layers, 10, display=True, split=0.25, verbose=False, trials=5)\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\nweights = layers[0].get_weights()\nW, W0 = weights\nprint(weights)\nW = np.asarray(W)\nW0 = np.asarray(W0)\nout = []\nfor x in inputs:\n x = np.asarray(x)\n val = np.dot(W.T, x)+W0\n out.append(list(val))\n\n#%%\n\nout\n\n#%%\n\ninputs = [[-1, 0], [1, 0], [0, -11], [0, 1],\n [-1, -1], [-1, 1], [1, 1], [1, -1]]\n\nx = np.array([-1, 0])\n\nmodel.predict(np.array([-1, 0]))\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n#%%\n\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# added initializer to the layer\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\",\n kernel_initializer=VarianceScaling(scale=0.001, mode='fan_in', distribution='normal', seed=None))]\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\n# Data scaled\nlayers = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n\n\ndef rescale(data):\n data_rescale = []\n for i in range(len(data)):\n data_rescale.append(data[i]/255.)\n return tuple(data_rescale)\n\n\ntrain = rescale(train)\nvalidation = rescale(validation)\nrun_keras_fc_mnist(train, validation, layers, epochs=10, split=0.1, trials=5)\n\n#%%\n\nfor epoch in [5, 10, 15]:\n print('for epoch:', epoch)\n layer = [Dense(input_dim=28*28, units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, layer, epoch,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\ntrain, validation = get_MNIST_data()\n\n\ndef rescale(data):\n X, y = data\n X = X/255.\n return (X, y)\n\n\ntrain = 
rescale(train)\nvalidation = rescale(validation)\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\narch = [Dense(input_dim=28*28, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\nrun_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\n\n#%%\n\nFC_Layer = [Dense(input_dim=48*48, units=512, activation=\"relu\"),\n Dense(units=256, activation='relu'),\n Dense(units=10, activation=\"softmax\")]\n\n#%%\n\n# Trying cnn:\nCNN_layers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(48, 48, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\n#%%\n\nlayers = [Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1), activation=\"relu\"),\n MaxPooling2D(pool_size=(2, 2)),\n Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),\n MaxPooling2D(pool_size=(2, 2)),\n Flatten(),\n Dense(units=128, activation='relu'),\n Dropout(rate=.5),\n Dense(units=10, activation='softmax')]\n\nrun_keras_cnn_mnist(train, validation, layers, epochs=1,\n split=0.1, verbose=False, trials=1)\n\n#%%", "original_comment": "# Shifted dataset\n", "target_code": "train_20, validation_20 = get_MNIST_data(shift=20)\ntrain_20 = rescale(train_20)\nvalidation_20 = rescale(validation_20)\n", "project_metadata": {"full_name": "elahea2020/6.036", "description": "Homework solutions of Intro to ML course at MIT Spring 2018", "topics": ["ml", "machine-learning", "machine-learning-algorithms", "mit", "6036", "perceptron-learning-algorithm", "rnn"], "git_url": "git://github.com/elahea2020/6.036.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2018-05-08T21:21:54Z", "size": 65530, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 18939819, "Python": 168769}, "last_updated": "2020-10-25T08:09:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "compatibility": "Strongly disagree", "compatibility-score": 0, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0}], "predicted_code": "train = shifted_mnist_data()\nvalidation = shifted_mnist_data()\nfor unit in (128, 256, 512, 1024):\n print('units: ', unit)\n arch = [Dense(input_dim=28*28, units=unit, activation=\"relu\"),\n Dense(units=10, activation=\"softmax\")]\n run_keras_fc_mnist(train, validation, arch, epochs=1,\n split=0.1, verbose=False, trials=5)\ntrain = shifted_mnist_data()\nvalidation = shifted_mnist_data()\nfor unit in (128\n", "model": "docstring", "intent": "# Shifted dataset"}, {"context": "#!/usr/bin/env 
python\n# coding: utf-8\n\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n\nmodel.output_shape\n\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, 
input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n\n15000/2\n\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n#%%\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n#%%\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n#%%\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, 
activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n#%%\n\nmodel.output_shape\n\n#%%\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n#%%\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n#%%\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return 
self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n#%%\n\n15000/2\n\n#%%\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n#%%\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n#%%", "original_comment": "# Plot a single hologram with the particles overlaid\n", "target_code": "def plot_hologram(h, img, outputs=\"none\"):\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure(figsize=(10, 10))\nplt.scatter(train_outputs_scaled_one, train_inputs_scaled_one,\n c=train_preds_scaled_one, cmap='viridis')\nplt.colorbar()\nplt.show()\n", "model": "natural", "intent": "# Plot a single hologram with the particles overlaid"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 
500)\npandas.set_option('display.max_rows', 500)\n\n\n# should be version 1.x\nprint(tf.__version__)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n#%%\n\n# should be version 1.x\nprint(tf.__version__)\n\n#%%", "original_comment": "# the following directive activates inline plotting\n", "target_code": "get_ipython().run_line_magic('matplotlib', 'inline')\n", "project_metadata": {"full_name": "NeilAlishev/HiCPredictor", "description": "Predict Hi-C maps from the DNA sequence using deep convolutional neural networks", "topics": [], "git_url": "git://github.com/NeilAlishev/HiCPredictor.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-01-12T17:39:25Z", "size": 25045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 9881823, "Python": 17479}, "last_updated": "2020-11-13T16:32:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "config = tf.ConfigProto()\nconfig.gpu_options.allow_growth = True\nset_session(tf.Session(config=config))\n", "model": "no-comments", "intent": "# activate inline plotting"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Sentiment Analysis
\n#\n\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\n\n\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n\ncorpus\n\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n\n# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn\n# 
https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n\npred\n\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

Sentiment Analysis
\n#\n\n#%%\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n#%%\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n#%%\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n#%%\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n#%%\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\n\n\n#%%\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n#%%\n\ncorpus\n\n#%%\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n#%%\n\n# AFINN Dictionary for Sentiment Analysis: 
https://github.com/fnielsen/afinn\n# https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n#%%\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n#%%\n\npred\n\n#%%\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n#%%\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values", "original_comment": " # as usual, we use text-embeddings\n", "target_code": " text = text_embedding(text)\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "sentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\ncorpus = []\nlabels = []\nfor line in sentiment_dataset[1:1000]:\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"\n", "model": "no-comments", "intent": " # use text-embeddings"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.\n# Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.\n# We're going to read in, clean up, and visualize the real-world project repository of Scala that spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.\n# The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:\n# 1. pulls.csv contains the basic information about the pull requests.\n# 2. pull_files.csv contains the files that were modified by each pull request.
\n\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.\n# The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.
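# Editor's illustrative sketch (not part of the original notebook):
# a minimal, self-contained example of the conversion described above.
# pd.to_datetime(..., utc=True) parses ISO8601 strings, honors any embedded
# offset, and returns timezone-aware timestamps normalized to UTC.
import pandas as pd
_iso = pd.Series(['2014-03-01T12:00:00Z', '2014-03-01T14:00:00+02:00'])
print(pd.to_datetime(_iso, utc=True))  # both rows come back as 2014-03-01 12:00:00+00:00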

\n\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.
\n\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?\n# The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.\n# For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.
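# Editor's illustrative sketch (not part of the original notebook):
# the cell below groups on a '%m%Y' string, which does not sort
# chronologically (e.g. '012012' sorts before '022011'). A '%Y-%m' key, as
# sketched here, keeps the months in calendar order on the x-axis.
# Assumes `pulls` is the DataFrame loaded above, with 'date' already in UTC.
_monthly = pulls.groupby(pulls['date'].dt.strftime('%Y-%m'))['pid'].count()
_monthly.plot(kind='bar')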

\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project that has a very small community might not be the best one to start working on. The small community might indicate a high barrier of entry. This can be caused by several factors, including a community that is reluctant to accept pull requests from \"outsiders,\" that the code base is hard to work with, etc. However, a large community can serve as an indicator that the project is regularly accepting pull requests from new contributors. Such a project would be a good place to start.\n# In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.
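# Editor's illustrative sketch (not part of the original notebook):
# groupby('user').count() keeps one count column per remaining column, so the
# histogram call below overlays several near-identical series. Counting a
# single column first gives one clean histogram. Assumes `pulls` is the
# DataFrame loaded above.
_prs_per_user = pulls.groupby('user')['pid'].count()
_prs_per_user.plot(kind='hist', bins=50)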

\n\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, some might be dead. Contributing there might not have the most impact. Therefore it is important to understand the parts of the system that have been recently changed. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening. Focusing on those parts might not be the most effective use of our time.
\n\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance. We might find ourselves needing some information regarding the codebase. It is important to direct any questions to the right person. Contributors to open source projects generally have other day jobs, so their time is limited. It is important to address our questions to the right people. One way to identify the right target for our inquiries is by using their contribution history.\n# We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.
\n\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.\n# Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.
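# Editor's illustrative sketch (not part of the original notebook):
# the cell below indexes `pull_files` with a boolean mask computed on the
# merged `data` frame. A related way to pull up the most recent pull requests
# touching this file is to filter `data` itself and take the ten newest rows.
# Assumes `data` (pulls merged with pull_files) and `file` are defined above.
_recent_file_prs = data[data['file'] == file].nlargest(10, 'date')
print(_recent_file_prs['user'].unique())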

\n\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## 1. Scala's real-world project repository data\n#

With almost 30k commits and a history spanning over ten years, Scala is a mature programming language. It is a general-purpose programming language that has recently become another prominent language for data scientists.\n# Scala is also an open source project. Open source projects have the advantage that their entire development histories -- who made changes, what was changed, code reviews, etc. -- are publicly available.\n# We're going to read in, clean up, and visualize the real-world project repository of Scala that spans data from a version control system (Git) as well as a project hosting site (GitHub). We will find out who has had the most influence on its development and who the experts are.\n# The dataset we will use, which has been previously mined and extracted directly from GitHub, is comprised of two files:\n# 1. pulls.csv contains the basic information about the pull requests.\n# 2. pull_files.csv contains the files that were modified by each pull request.
\n\n#%%\n\n# Importing pandas\nimport pandas as pd\n\n# Loading in the data\npulls = pd.read_csv('datasets/pulls.csv')\npull_files = pd.read_csv('datasets/pull_files.csv')\n\n\n# ## 2. Cleaning the data\n#

The raw data extracted from GitHub contains dates in the ISO8601 format. However, pandas imports them as regular strings. To make our analysis easier, we need to convert the strings into Python's DateTime objects. DateTime objects have the important property that they can be compared and sorted.

\n#

The pull request times are all in UTC (also known as Coordinated Universal Time). The commit times, however, are in the local time of the author with time zone information (number of hours difference from UTC). To make comparisons easy, we should convert all times to UTC.
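As a quick illustration of what utc=True does (the timestamps below are made-up samples, not values from the dataset), offset-aware ISO8601 strings are normalized to a common UTC timeline:

import pandas as pd

sample = pd.Series(['2018-01-15T10:00:00+02:00', '2018-01-15T08:00:00Z'])
# Both entries parse to the same instant: 2018-01-15 08:00:00+00:00
print(pd.to_datetime(sample, utc=True))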

\n\n#%%\n\n# Convert the date for the pulls object\npulls['date'] = pd.to_datetime(pulls['date'], utc=True)\n\n\n# ## 3. Merging the DataFrames\n#

The data extracted comes in two separate files. Merging the two DataFrames will make it easier for us to analyze the data in the future tasks.

\n\n#%%\n\n# Merge the two DataFrames\ndata = pulls.merge(pull_files, on=['pid'])\nprint(data.head())\n\n\n# ## 4. Is the project still actively maintained?\n#

The activity in an open source project is not very consistent. Some projects might be active for many years after the initial release, while others can slowly taper out into oblivion. Before committing to contributing to a project, it is important to understand the state of the project. Is development going steadily, or is there a drop? Has the project been abandoned altogether?

\n#

The data used in this project was collected in January of 2018. We are interested in the evolution of the number of contributions up to that date.

\n#

For Scala, we will do this by plotting a chart of the project's activity. We will calculate the number of pull requests submitted each (calendar) month during the project's lifetime. We will then plot these numbers to see the trend of contributions.
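One caveat worth a sketch: a month-first key such as '%m%Y' sorts lexicographically, not chronologically, so a year-first key keeps the plotted trend in calendar order. A minimal variant of the cell below, under that assumption:

# Year-first keys ('YYYY-MM') group and sort chronologically
pulls['month_year'] = pulls['date'].dt.strftime('%Y-%m')
counts = pulls.groupby('month_year')['pid'].count()
counts.plot(kind='bar')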

\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Create a column that will store the month and the year, as a string\npulls['month_year'] = pulls['date'].dt.strftime('%m%Y')\n\n# Group by month_year and count the pull requests\ncounts = pulls.groupby('month_year').count()\n\n# Plot the results\ncounts.plot()\n\n\n# ## 5. Is there camaraderie in the project?\n#

The organizational structure varies from one project to another, and it can influence your success as a contributor. A project with a very small community might not be the best one to start working on: a small community can indicate a high barrier to entry, caused by several factors such as a community that is reluctant to accept pull requests from \"outsiders,\" a code base that is hard to work with, etc. A large community, on the other hand, can serve as an indicator that the project regularly accepts pull requests from new contributors. Such a project would be a good place to start.

\n#

In order to evaluate the dynamics of the community, we will plot a histogram of the number of pull requests submitted by each user. A distribution that shows that there are few people that only contribute a small number of pull requests can be used as an indicator that the project is not welcoming of new contributors.
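A minimal sketch of that histogram, assuming the pulls DataFrame from earlier; selecting a single column keeps the plot to one distribution of per-user pull request counts:

# Number of pull requests submitted by each user
by_user = pulls.groupby('user')['pid'].count()
by_user.plot(kind='hist', bins=20)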

\n\n#%%\n\n# Required for matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# Group by the submitter\nby_user = pulls.groupby('user').count()\n\n# Plot the histogram\nby_user.plot(kind='hist')\n\n\n# ## 6. What files were changed in the last ten pull requests?\n#

Choosing the right place to make a contribution is as important as choosing the project to contribute to. Some parts of the code might be stable, others might be dead; contributing there might not have much impact, and focusing on those parts might not be the most effective use of our time. It is therefore important to understand which parts of the system have been changed recently. This allows us to pinpoint the \"hot\" areas of the code where most of the activity is happening.

\n\n#%%\n\n# Identify the last 10 pull requests\nlast_10 = pulls.nlargest(10, 'date')\n\n# Join the two data sets\njoined_pr = last_10.merge(pull_files, on='pid')\n\n# Identify the unique files\nfiles = set(joined_pr['file'])\n\n# Print the results\nfiles\n\n\n# ## 7. Who made the most pull requests to a given file?\n#

When contributing to a project, we might need some guidance or some information regarding the codebase. Contributors to open source projects generally have other day jobs, so their time is limited, and it is important to direct our questions to the right people. One way to identify the right target for our inquiries is their contribution history.

\n#

We identified src/compiler/scala/reflect/reify/phases/Calculate.scala as being recently changed. We are interested in the top 3 developers who changed that file. Those developers are the ones most likely to have the best understanding of the code.

\n\n#%%\n\n# This is the file we are interested in:\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Identify the commits that changed the file\nfile_pr = data[data['file'] == file]\n\n# Count the number of changes made by each developer\nauthor_counts = file_pr.groupby('user').count()\n\n# Print the top 3 developers\nprint(author_counts.nlargest(3, 'pid'))\n\n\n# ## 8. Who made the last ten pull requests on a given file?\n#

Open source projects suffer from fluctuating membership. This makes the problem of finding the right person more challenging: the person has to be knowledgeable and still be involved in the project. A person that contributed a lot in the past might no longer be available (or willing) to help. To get a better understanding, we need to investigate the more recent history of that particular part of the system.

\n#

Like in the previous task, we will look at the history of src/compiler/scala/reflect/reify/phases/Calculate.scala.

\n\n#%%\n\nfile = 'src/compiler/scala/reflect/reify/phases/Calculate.scala'\n\n# Select the pull requests that changed the target file\nfile_pr = pull_files[data['file'] == file]", "original_comment": "# Merge the obtained results with the pulls DataFrame\n", "target_code": "joined_pr = file_pr.merge(pulls, on='pid')\n", "project_metadata": {"full_name": "ChristianNogueira/datacamp_projects", "description": "DataCamp Projects", "topics": ["datacamp"], "git_url": "git://github.com/ChristianNogueira/datacamp_projects.git", "stars": 17, "watchers": 17, "forks": 13, "created": "2018-01-17T16:58:27Z", "size": 8441, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12129948}, "last_updated": "2020-08-21T20:03:31Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "merged_pr = pd.merge(file_pr, author_counts, on='user')\nmerged_pr.head()\n", "model": "docstring", "intent": "# Merge the obtained results with the pulls DataFrame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, 
delimiter=\"\\t\")\nread_tsv\n\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\nsavelist # list of lists\n\n\n# we can check the items of the list we created\nsavelist[1]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n#%%\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n#%%\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n#%%\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n#%%\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n#%%\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n#%%\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n#%%\n\nsavelist # list of lists\n\n#%%\n\n# we can check the items of the list we created\nsavelist[1]\n\n#%%", "original_comment": "# access items from a list\n", "target_code": "savelist[0][1]\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", 
"description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "with open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\", \"r\") as infile:\n content = infile.read()\nprint(content)\n", "model": "no-comments", "intent": "# access items from a list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata.isnull().sum()\n\n\ndata.dropna(inplace=True)\n\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 
'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n#%%\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata.isnull().sum()\n\n#%%\n\ndata.dropna(inplace=True)\n\n#%%\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n#%%\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.explained_variance_\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig = plt.figure(figsize=(10, 10))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(pca_comps[:, 0], pca_comps[:, 1], pca_comps[:, 2])\nplt.show()\n", "model": "no-comments", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and 
ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify 
recognizeable null values across each feature in dataset and conditionally alert user.\n\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed Datasets \n\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA 
+\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. \"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. 
\"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # PART 2: Intermediate Data Processing\n\n# In this Jupyter Notebook, we further investigate the interim datasets through a **processing** lens: we analyze, transform, scale, encode, reduce, and otherwise munge our data to prepare it for predictive analysis and machine learning-based modeling.\n#\n# - **NOTE**: Before working through this notebook, please ensure that you have all necessary dependencies as denoted in [Section A: Imports and Initializations](#section-A) of this notebook.\n#\n# - **NOTE**: Before working through Sections A-D of this notebook, please run all code cells in [Appendix A: Supplementary Custom Objects](#appendix-A) to ensure that all relevant functions and objects are appropriately instantiated and ready for use.\n#\n# ---\n\n# ## \ud83d\udd35 TABLE OF CONTENTS \ud83d\udd35 \n#\n# Use this **table of contents** to navigate the various sections of the processing notebook.\n#\n# #### 1. [Section A: Imports and Initializations](#section-A)\n#\n# All necessary imports and object instantiations for data processing.\n#\n# #### 2. [Section B: Specialized Encoding](#section-B)\n#\n# Data encoding operations, including value range mapping,\n# correlational plotting, and categorical encoding.\n#\n# #### 3. [Section C: Data Scaling & Transformation](#section-C)\n#\n# Data transformation techniques, including standard scaling/normalization\n# and feature reduction techniques.\n#\n# #### 4. [Section D: Saving Our Processed Datasets](#section-D)\n#\n# Saving processed data states for further access.\n#\n# #### 5. [Appendix A: Supplementary Custom Objects](#appendix-A)\n#\n# Custom Python object architectures used throughout the data processing.\n#\n# ---\n\n# ## \ud83d\udd39 Section A: Imports and Initializations \n\n# General Importations for Data Manipulation and Visualization.\n\n#%%\n\nfrom custom_structures import corrplot_\nfrom dataset_processor import Dataset_Processor\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy import stats\n\n\n# Algorithms for Data Scaling and Feature Reduction.\n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\n\n\n# Custom Algorithmic Structures for Processed Data Visualization.\n\n#%%\n\nimport sys\nsys.path.append(\"../structures/\")\n\n\n# #### Instantiate Our Processor Engine\n\n# Custom Processor Class for Target-Oriented Data Modification.\n#\n# **NOTE**: Please refer to _Appendix A: Supplementary Custom Objects_ to view the fully implemented processor object.\n\n#%%\n\nproc = Dataset_Processor()\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section B: Data Encoding \n\n# #### Read Our Preprocessed Data Into Conditional DataFrame(s)\n#\n# **Call** `.load_data()` **method to load in all conditionally separated interim datasets.**\n#\n# _NOTE_: Currently loading in both datasets independently using defaulted condition `which=\"both\"`.\n\n#%%\n\n(df_train_i, df_test_i) = proc.load_data()\n\n\n# #### Produce Correlation Matrix\n#\n# **NOTE**: Zooming in manually is required to view exact correlational values due to figure sizing constraints.\n\n#%%\n\ncorrplot_(df_train_i, figsize=(50, 50))\n\n\n# #### Outlier Detection and Removal Using _Tukey's Method_\n#\n# The next 
step of processing involves removing outliers from our dataset using the _Tukey Method_, which states that data that resides outside of our Tukey fences (our IQR fences (\u00b1) multiplied by 150%) can be safely eliminated to approximately maximize signal-to-noise ratio.\n#\n# **NOTE**: For this project, _Tukey's Method_ is **not recommended** due to extensive loss of data.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"tukey\", result=\"omit\")\n\n\n# #### Outlier Detection and Removal Using _Z-Score Filtering_\n#\n# The next step of processing involves removing outliers from our dataset using _Z-Score Filtering_, which states that data that resides outside of three standard deviations (approximately at the 0.3rd and 99.7th percentiles) can be safely eliminated to approximately maximize signal-to-noise ratio.\n\n#%%\n\nfor feature in df_train_i:\n outlier_detector(df_train_i, feature, how=\"stddev\", result=\"omit\")\n\n\n# #### Null Value Detection in Case of Imputation\n#\n# Identify recognizeable null values across each feature in dataset and conditionally alert user.\n\n#%%\n\nnull_detector(df_train_i, alert=True)\nnull_detector(df_test_i, alert=True)\n\n\n# No null values have been detected across our entire dataset (_training_ and _testing_), which is great! Let's check the testing data as well.\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section C: Data Scaling & Transformation \n\n# **INCLUDED PROCESSES:**\n#\n# - **Data Scaling** with `StandardScaler()`\n# - **Dimensionality Reduction** with `PCA()`\n\n# Conditional separation of training and testing datasets into `X` and `y` data.\n\n#%%\n\nTARGET_VARIABLE = \"0.4\"\n\nX_train_pro, y_train_pro = dataset_separator(df_train_i, TARGET_VARIABLE)\nX_test_pro, y_test_pro = dataset_separator(df_test_i, TARGET_VARIABLE)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Fully processed dataset (X) ready to save.**\n# **Fully processed labels (y) ready to save.**\n\n# #### `StandardScaler()` Fits and Transforms Full X-Data Into Scaled Datasets\n\n#%%\n\nX_train_sca, X_test_sca = dataset_scaler(X_train_pro, X_test_pro)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Scaled dataset (X) ready to save.**\n\n# #### Principal Component Analysis with Threshold of 75% on Genomic Features\n\n# Use concatenation techniques to marry the training and testing X-datasets temporarily for synchronous dimensionality reduction.\n\n#%%\n\nX_full_sca = pd.concat([X_train_sca, X_test_sca], keys=[\"train\", \"test\"])\n\n\n# **Call** `dimensionality_reducer()` **function to grab principal component features from dataset that represent given threshold percentage of explained target variance.**\n#\n# **NOTE**: Performing dimensionality reduction with `PCA()` restricts indexing on original training and testing datasets, necessitating the use of custom training/testing-splitting objects.\n\n#%%\n\nX_full_red = dimensionality_reducer(X_full_sca, THRESHOLD=0.75)\n\n\n# **Call** `train_test_split()` **to conditionally split feature-reduced dataset into training and testing datasets.**\n\n#%%\n\ntrain_size = float(X_train_sca.shape[0]) / float(X_full_red.shape[0])\ntest_size = 1 - train_size\n\nX_train_red, X_test_red = train_test_split(\n X_full_red, train_size=train_size, test_size=test_size)\n\n\n# #### \ud83d\udd38 CHECKPOINT \ud83d\udd38\n#\n# **Dimensionally-reduced dataset (X) ready to save.**\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Section D: Saving Our Processed 
Datasets \n\n#%%\n\nREL_PATH_PROC_DATA = \"../data/processed/\"\nDATA_X, DATA_y = \"X/\", \"y/\"\nSUBDIR_PROC, SUBDIR_SCA, SUBDIR_RED = \"processed/\", \"scaled/\", \"reduced/\"\n\nX_TRAIN_PROC, X_TEST_PROC = \"train_pXp\", \"test_pXp\"\nX_TRAIN_SCA, X_TEST_SCA = \"train_pXs\", \"test_pXs\"\nX_TRAIN_RED, X_TEST_RED = \"train_pXr\", \"test_pXr\"\ny_TRAIN_PROC, y_TEST_PROC = \"train_pyp\", \"test_pyp\"\n\n\n# #### Saving Data: _Fully Processed X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TRAIN_PROC)\nproc.save_dataset(X_test_pro, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_PROC + X_TEST_PROC)\n\n\n# #### Saving Data: _Scaled X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TRAIN_SCA)\nproc.save_dataset(X_test_sca, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_SCA + X_TEST_SCA)\n\n\n# #### Saving Data: _Dimensionally Reduced X-Datasets_\n\n#%%\n\nproc.save_dataset(X_train_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TRAIN_RED)\nproc.save_dataset(X_test_red, REL_PATH_PROC_DATA +\n DATA_X + SUBDIR_RED + X_TEST_RED)\n\n\n# #### Saving Data: _Fully Processed Targets (y)_\n\n#%%\n\nproc.save_dataset(y_train_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\nproc.save_dataset(y_test_pro, REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TEST_PROC)\n\n\n# ##### [(back to top)](#TOC)\n#\n# ---\n\n# ## \ud83d\udd39 Appendix A: Supplementary Custom Objects \n\n# #### A[1]: 6Nomads Dataset Processor.\n#\n# To view the **Data Processor Engine**, please follow the following steps:\n#\n# 1. Navigate to the `structures` sibling directory.\n# 2. Access the `dataset_processor.py` file.\n# 3. View the `Dataset_Processor()` object architecture.\n\n# #### A[2]: Feature Visualizer.\n#\n# Function to produce value-based histogram on each feature.\n\n#%%\n\ndef feature_visualizer(dataset, feature, buckets=20):\n \"\"\" Function to produce bar-chart histogram-based visualization from single feature of dataset. \"\"\"\n plt.hist(dataset[feature].values, bins=buckets)\n plt.show()\n\n\n# #### A[3]: Outlier Removal using Custom Detection Method.\n#\n# Function to filter data within feature of DataFrame using Tukey's IQR Method or Normal-distribution-based sigma removal.\n\n#%%\n\ndef outlier_detector(dataset, feature, how=\"tukey\", result=\"omit\", percentile=25, sigma=3):\n \"\"\"\n Function to filter data within feature of DataFrame.\n\n INPUT(S):\n {dataset}:\n - pd.DataFrame\n {feature}:\n - str\n {how}:\n - str(tukey)\n - str(lookup)\n {result}\n - str(omit)\n - str(mean)\n\n OUTPUT(S):\n - Nonetype: \n \"\"\"\n data = sorted(dataset[feature].values)\n if how == \"tukey\":\n Q1, Q3 = np.percentile(data, [50 - percentile, 50 + percentile])\n IQR = Q3 - Q1\n LOWER_FENCE, UPPER_FENCE = (Q1 - (1.5 * IQR), Q3 + (1.5 * IQR))\n elif how == \"stddev\":\n LOWER_FENCE, UPPER_FENCE = (\n np.mean(data) - (sigma * np.std(data)), np.mean(data) + (sigma * np.std(data)))\n\n ARG_LOWER_OUTLIERS, ARG_UPPER_OUTLIERS = (\n dataset[feature] < LOWER_FENCE), (dataset[feature] > UPPER_FENCE)\n if result == \"omit\":\n dataset.drop(dataset[ARG_LOWER_OUTLIERS |\n ARG_UPPER_OUTLIERS].index, inplace=True)\n\n\n# #### A[4]: Null Value Detector with Custom User Alerts.\n#\n# Function to detect null values across each feature across input dataset and alert user based on null value presence.\n\n#%%\n\ndef null_detector(dataset, alert=True):\n \"\"\" Function to detect null values across all features of input dataset. 
\"\"\"\n for feature in dataset:\n null_count = len(dataset[dataset[feature].isna()])\n if null_count > 0:\n print(\"FEATURE='{}': {} Null Values Detected!\".format(\n feature, null_count))\n else:\n pass\n\n\n# #### A[5]: Dataset Separator Function for X-y Consolidation.\n#\n# Function to break input dataset into X- and y-datasets for predictive analysis.\n\n#%%\n\ndef dataset_separator(dataset, target):\n \"\"\" Function to split input dataset into X and y datasets. \"\"\"\n X = dataset.drop(columns=target)\n y = dataset[target]\n return X, y\n\n\n# #### A[6]: Dataset Scaling Function for X-Data Reshaping.\n#", "original_comment": "# Function to scale input data (`X`) for predictive purposes.\n", "target_code": "from sklearn.preprocessing import StandardScaler\n\ndef dataset_scaler(training_data, testing_data, obj=StandardScaler):\n \"\"\" Function to scale X-data using custom input algorithm. \"\"\"\n SCALED_FEATURES = [feature + \"_sca\" for feature in training_data]\n scaler = obj()\n scaler.fit(training_data)\n X_train_sca = pd.DataFrame(scaler.transform(\n training_data).T, SCALED_FEATURES).T\n X_test_sca = pd.DataFrame(scaler.transform(\n testing_data).T, SCALED_FEATURES).T\n return X_train_sca, X_test_sca\n", "project_metadata": {"full_name": "AakashSudhakar/6nomads-interview-project", "description": "Interview project repository for data analysis and prediction for 6Nomads data. ", "topics": ["data-analysis", "data-processing", "data-science", "machine-learning", "data-structures"], "git_url": "git://github.com/AakashSudhakar/6nomads-interview-project.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2019-09-06T05:04:40Z", "size": 385, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 545554, "Python": 21164}, "last_updated": "2020-05-13T23:33:12Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "dataset = pd.read_csv(REL_PATH_PROC_DATA +\n DATA_y + SUBDIR_PROC + y_TRAIN_PROC)\ndataset_separator(dataset, True)\n", "model": "no-comments", "intent": "# Function to scale input data (`X`) for predictive purposes."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 
1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = 
df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n 
df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from 
disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 
'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel 
df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to 
disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # 
Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))", "original_comment": "# Make a copy in the specific subfolder\n", "target_code": "df_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, 
"last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_new = pd.DataFrame()\ndf_new['Mobility'] = df_Mobility_rolling\ndf_new['Growth'] = df_GrowthCases\ndf_new['Cases'] = df_CasesCapita\n", "model": "docstring", "intent": "# Make a copy in the specific subfolder"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n\n# Importing the training and test dataset\ntrain_df = pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n\ntrain_df.head()\n\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport numpy as np # For numerical computation\nimport pandas as pd # For data manipulation\nimport matplotlib.pyplot as plt # For data manipulation\nimport os # For file manipulation\nimport keras # For creating CNNs\n\n# To slpit training data into train and validation set\nfrom sklearn.model_selection import train_test_split\n# For converting labels into their one-hot representations\nfrom keras.utils import to_categorical\n\nfrom keras.models import Sequential # Sequential model is a stack of layers\n# Convolutional and Maxpooling layers for CNNs\nfrom keras.layers import Conv2D, MaxPooling2D\n# Dense-Densly connected NN layer, Dropout-Reduces overfittiing\nfrom keras.layers import Dense, Dropout\n# Adds a channel dimension to the input\nfrom keras.layers import Flatten, BatchNormalization\n\n#%%\n\n# Importing the training and test dataset\ntrain_df 
= pd.read_csv('./fashion-mnist_train.csv')\ntest_df = pd.read_csv('./fashion-mnist_test.csv')\n\n#%%\n\ntrain_df.head()\n\n#%%\n\n# converting all the columns other than label into a numpy array\ntrain_data = np.array(train_df.iloc[:, 1:])\ntest_data = np.array(test_df.iloc[:, 1:])\n\n# Converting all the labels into categorical labels\ntrain_labels = to_categorical(train_df.iloc[:, 0])\ntest_labels = to_categorical(test_df.iloc[:, 0])\n\n#%%\n\nrows, cols = 28, 28 # Size of images\n\n# Reshaping the test and train data\ntrain_data = train_data.reshape(train_data.shape[0], rows, cols, 1)\ntest_data = test_data.reshape(test_data.shape[0], rows, cols, 1)", "original_comment": "# To cast data into float32 type\n", "target_code": "train_data = train_data.astype('float32')\n", "project_metadata": {"full_name": "aditya2000/MNIST-Fashion-", "description": null, "topics": [], "git_url": "git://github.com/aditya2000/MNIST-Fashion-.git", "stars": 3, "watchers": 3, "forks": 3, "created": "2019-07-10T10:06:01Z", "size": 40, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 76938}, "last_updated": "2020-09-28T23:05:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train_data = train_data.astype('float32')\ntest_data = test_data.astype('float32')\ntrain_data /= 255\ntest_data /= 255\n", "model": "no-comments", "intent": "# To cast data into float32 type"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. 
These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Why tidy data is useful for exploratory analysis\n#\n# We **explore** data to find interesting **patterns** by:\n# - **Visualizing** (with tables or charts) **individuals, distributions or aggregations of numerical values (measures)**\n# - **Splitting by categorical variables (dimensions)**, which can include:\n# - separating subsets spatially along an axis,\n# - distinguishing by color,\n# - or making separate plots (small multiples) in columns or rows\n#\n# ### Tidy data makes this sort of exploration and analysis easy!\n#\n# Tidy data's structure of *one variable per column* and *one observation per row* makes it easy to do this exploration by making it simple to aggregate and visualize. These same procedures would be hard if, say, we had multiple observations in a single row, or if the same type of variable were split across multiple columns!\n\n# ---\n#\n# *To preserve the mystery, select from the notebook menus*\n#\n# `Edit -> Clear All Outputs`\n#\n# ---\n\n#%%\n\nimport seaborn as sns", "original_comment": "# This makes a white background with grid lines\n", "target_code": "import seaborn as sns\n\nsns.set_style(\"whitegrid\")\n", "project_metadata": {"full_name": "emonson/pandas-datamatters", "description": "Python for Tabular Data and Visualization \u2013 Data Matters 2020", "topics": [], "git_url": "git://github.com/emonson/pandas-datamatters.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-12-02T18:35:22Z", "size": 5862, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1794056}, "last_updated": "2021-01-05T16:21:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.set(style=\"whitegrid\")\n", "model": "natural", "intent": "# Make a white background with grid lines"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local 
parameters\n\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n#%%\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker.estimator import Estimator\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n#%%\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)", "original_comment": "# #### Submit the job\n", "target_code": "from sagemaker.estimator import Estimator\n\nestimator = Estimator(role=role,\n train_instance_count=n_instances,\n train_instance_type=instance_type,\n image_name=image_name,\n hyperparameters=hyperparameters)\nestimator.fit()\n", "project_metadata": {"full_name": "daniel-fudge/DRL-Portfolio-Optimization-Custom", "description": "A portfolio optimization framework leveraging Deep Reinforcement Learning (DRL) and a custom trading environment", "topics": [], "git_url": "git://github.com/daniel-fudge/DRL-Portfolio-Optimization-Custom.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-06-12T22:27:29Z", "size": 35064, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1170339, "Python": 39958, "Shell": 4637}, "last_updated": "2020-11-01T22:06:49Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": 
{"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "get_ipython().run_cell_magic('time', '',\n '\\nimport sagemaker\\nfrom sagemaker import get_execution_role\\n\\nrole = get_execution_role()\\nprint(role)\\nsess = sagemaker.Session()')\n", "model": "natural", "intent": "# Submit the job"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. 
According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 
76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. Let's also save the dataset locally to `GDrive`:\n\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n\n# number of anomalies vs. 
regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. 
Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. 
These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n#%%\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n#%%\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' 
computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n#%%\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n#%%\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n#%%\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n#%%\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n#%%\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. 
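For completeness, the column names and data types could also be inspected, for instance with the Pandas `info()` method:\n\n#%%\n\n# inspect column names, non-null counts and data types of the loaded dataset\nori_dataset.info()\n\n\n# 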
Let's also save the dataset locally to `GDrive`:\n\n#%%\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n#%%\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n#%%\n\n# number of anomalies vs. regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n#%%\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. 
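One quick way to verify this characteristic could be to count the number of distinct values of each categorical attribute, for instance using the Pandas `nunique()` method:\n\n#%%\n\n# count the distinct values of each of the seven categorical attributes listed above\nori_dataset[['BELNR', 'BUKRS', 'KTOSL', 'PRCTR', 'BSCHL', 'HKONT', 'WAERS']].nunique()\n\n\n# 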
Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n#%%\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n#%%\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n#%%\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n#%%\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n#%%\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)", "original_comment": "# normalize all numeric attributes to the range [0,1]\n", "target_code": "ori_dataset_num_processed = (\n numeric_attr - numeric_attr.min()) / (numeric_attr.max() - numeric_attr.min())\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2, "coverage": "Strongly disagree", "coverage-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "ori_dataset.describe()\n", "model": "no-comments", "intent": "# normalize all numeric attributes to the range [0,1]"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n\n# ##Inspecting the data\n\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n\nsns.countplot(x='Class', data=dat)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n#%%\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary 
libraries\n\n#%%\n\n# ##Inspecting the data\n\n#%%\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n#%%\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n#%%\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n#%%\n\nsns.countplot(x='Class', data=dat)", "original_comment": "# We won't be using \"Time\" variable\n", "target_code": "dat = dat.drop(['Time'], 1)\n", "project_metadata": {"full_name": "dpanagop/ML_and_AI_examples", "description": null, "topics": [], "git_url": "git://github.com/dpanagop/ML_and_AI_examples.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2019-07-16T10:55:13Z", "size": 12192, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5902376}, "last_updated": "2020-11-24T20:45:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "dat = dat.drop(['Time', 'Amount'], axis=1)\n", "model": "no-comments", "intent": "# We won't be using \"Time\" variable"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n\n# Check shape.\ncaravan_df_raw.shape\n\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. 
The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n#%%\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n#%%\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n#%%\n\n# Check shape.\ncaravan_df_raw.shape\n\n#%%\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n#%%\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n#%%\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n#%%\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n#%%\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n#%%\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n#%%\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n#%%", "original_comment": "# Look globally at correlation of features.\n", "target_code": "corr = caravan_df_raw.corr()\n", "project_metadata": {"full_name": "jonrossi/caravan-insurance", "description": "Exploration and analysis of the Caravan Insurance dataset", "topics": [], "git_url": "git://github.com/jonrossi/caravan-insurance.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-09-23T17:40:57Z", "size": 951, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1260942}, "last_updated": "2020-10-31T21:58:03Z"}, "annotations": [{"completed_by": 
{"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "caravan_df = caravan_df_raw.drop(['Purchase'], axis=1)\n", "model": "no-comments", "intent": "# Look globally at correlation of features."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = 
pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', 
limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census 
population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = 
numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 
'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n\n# Diffusion Term: Model the spread of covid19 from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = 
df_distance_used.values\n# avoid division by zero for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\ndt_cutoff_training_COVID = datetime(2020, 5, 31, tzinfo=pytz.utc)\n#dt_cutoff_training_COVID = datetime(2020,7,25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98May31JHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data 
(county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = 
df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the 
specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = 
df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n\n# Make a copy in the specific subfolder\ndf_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n\n#%%\n\n# Get processed LogCases DataFrame\ndf_LogCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogCases.csv'), index_col='timestamp')\ndf_LogCases.index = pandas.to_datetime(df_LogCases.index)\ndf_LogCases.columns = df_LogCases.columns.astype(int)\ndf_LogCases.columns.name = 'pairs_id'\ndf_LogCases.tail()\n\n#%%\n\n# Get processed GrowthCases DataFrame\ndf_GrowthCases = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_GrowthCases.csv'), index_col='timestamp')\ndf_GrowthCases.index = pandas.to_datetime(df_GrowthCases.index)\ndf_GrowthCases.columns = df_GrowthCases.columns.astype(int)\ndf_GrowthCases.columns.name = 'pairs_id'\ndf_GrowthCases.tail()\n\n#%%\n\n# Get processed CasesCapita DataFrame\ndf_CasesCapita = pandas.read_csv(os.path.join(\n data_subdirectory, 
'df_CasesCapita.csv'), index_col='timestamp')\ndf_CasesCapita.index = pandas.to_datetime(df_CasesCapita.index)\ndf_CasesCapita.columns = df_CasesCapita.columns.astype(int)\ndf_CasesCapita.columns.name = 'pairs_id'\ndf_CasesCapita.tail()\n\n#%%\n\n# Get processed Mobility DataFrame\ndf_Mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_Mobility.csv'), index_col='timestamp')\ndf_Mobility.index = pandas.to_datetime(df_Mobility.index)\ndf_Mobility.columns = df_Mobility.columns.astype(int)\ndf_Mobility.columns.name = 'pairs_id'\ndf_Mobility.tail()\n\n#%%\n\n# Get processed LogMobility DataFrame\ndf_LogMobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_LogMobility.csv'), index_col='timestamp')\ndf_LogMobility.index = pandas.to_datetime(df_LogMobility.index)\ndf_LogMobility.columns = df_LogMobility.columns.astype(int)\ndf_LogMobility.columns.name = 'pairs_id'\ndf_LogMobility.tail()\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Pairwise distance between county centroids\n\ndef haversine(lat1,lon1,lat2,lon2):\n #This uses the \u2018haversine\u2019 formula to calculate the great-circle distance between two points \u2013 that is, \n #the shortest distance over the earth\u2019s surface \u2013 giving an \u2018as-the-crow-flies\u2019 distance between the points \n #(ignoring any hills they fly over, of course!).\n #Haversine\n #formula: a = sin\u00b2(\u0394\u03c6/2) + cos \u03c61 \u22c5 cos \u03c62 \u22c5 sin\u00b2(\u0394\u03bb/2)\n #c = 2 \u22c5 atan2( \u221aa, \u221a(1\u2212a) )\n #d = R \u22c5 c\n #where \u03c6 is latitude, \u03bb is longitude, R is earth\u2019s radius (mean radius = 6,371km);\n #note that angles need to be in radians to pass to trig functions!\n R = 6371.0088 #km\n lat1,lon1,lat2,lon2 = map(numpy.radians, [lat1,lon1,lat2,lon2])\n\n dlat = lat2 - lat1\n dlon = lon2 - lon1\n a = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * numpy.sin(dlon/2) **2\n c = 2 * numpy.arctan2(a**0.5, (1-a)**0.5)\n d = R * c\n return round(d,4)\n\nhaversine_vec = numpy.vectorize(haversine)\n\n# Get the polygon centroids\ndf_region['centroid'] = df_region['poly'].apply(lambda x: x.centroid)\n\n# Extract the lats and lons of the centroids\ndf_centroids = df_region[['pairs_id', 'centroid']].set_index('pairs_id')\nlons = df_centroids['centroid'].apply(lambda x: x.coords.xy[0][0]).values\nlats = df_centroids['centroid'].apply(lambda x: x.coords.xy[1][0]).values\ndf_centroids.tail()\n\n# Empty array to hold the results of pointwise Haversine\ndistance = numpy.zeros((len(lons), len(lons)))\ndistance[:] = numpy.nan\n\n# Haversine distance\nfor i, (lon1, lat1) in enumerate(zip(lons, lats)):\n distance[i, :i+1] = haversine_vec(lats[:i+1], lons[:i+1], lat1, lon1)\n \n# Numpy to Pandas and filling in nan\ndf_distance = pandas.DataFrame(distance)\ndf_distance = df_distance.fillna(pandas.DataFrame(distance.T)) # Since we only calculated half the values\ndf_distance.columns = list(df_centroids.index)\ndf_distance.index = list(df_centroids.index)\n\n# Write to disk\ndf_distance.to_csv('data/df_distance.csv')\n\"\"\"\n\n#%%\n\n# Read county-to-county distance matrix from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\n\n# Make a copy in the specific subfolder\ndf_distance.to_csv(os.path.join(data_subdirectory, 'df_distance.csv'))\n\n# Read back from disk\ndf_distance = pandas.read_csv('data/df_distance.csv', index_col=0)\ndf_distance.columns = list(df_distance.columns.astype(int))\n\ndf_distance.tail()\n\n#%%\n\n# Diffusion Term: Model the spread of covid19 
from highly infected counties to other counties\n\nfill_value = -4\n# Log new cases\ndf_log_cases = df_LogCases.fillna(fill_value)\nlogCases = df_log_cases.values\n\n# Not all pairs_ids are actually used in cases only these: df_log_cases.columns.values\ndf_distance_used = df_distance[df_log_cases.columns.values].T[df_log_cases.columns.values]\ndistance_used = df_distance_used.values\n# avoid division by zero for diagonal elements\ndistance_used[distance_used == 0] = numpy.nan\n\n# Clip to avoid outliers through division by small number\ndistance_used = numpy.clip(distance_used, a_min=10, a_max=None)\n\n# Population (total for county)\ndf_pop = df_population.set_index('pairs_id').T[df_log_cases.columns.values]\npop = df_pop.values[0]\n\n# Use 3D array for speedy calculation. Dimension 0 is time\nlogCases1 = logCases[:, :, None] # other counties\npop1 = pop[None, :, None] # other counties\n\n# casesCapita0 = cases_capita[:, None, :] # own county\nlogCases0 = logCases[:, None, :] # own county\nlogCases_null = logCases0.copy() # own county\nlogCases_null[:] = fill_value # this makes sure the diagonal elements are nan\n\nweighted_difference = ((logCases1 - logCases0) * pop1 / 1e5\n / distance_used[None, :, :])\n\n# Sum up the weighted differences\nsummed = numpy.nansum(weighted_difference, axis=1) # sum over other counties\n\ndf_Diffusion = df_log_cases.copy() # Just to get the indices and columns right\ndf_Diffusion[:] = summed", "original_comment": "# Write to disk\n", "target_code": "df_Diffusion.to_csv(os.path.join(data_subdirectory, 'df_Diffusion.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, "last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_Diffusion.to_csv('data/df_diffusion.csv')\n", "model": "natural", "intent": "# Write to disk"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. 
Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. 
Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. 
Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. 
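As a purely hypothetical numeric illustration: for a separating hyperplane in $\\mathbb{R}^{2}$ with $w = (3, 4)$ and $b = -2$, we have $||w|| = 5$ and hence a margin of $2/||w|| = 0.4$ in the units of the feature space. 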
Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to the constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin separating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin separating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classified training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish results in the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Substituting those conditions back into the Lagrangian above yields its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} - \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}(x_{i} \\cdot x_{j})$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset, through their pairwise dot products $x_{i} \\cdot x_{j}$, as well as on the associated labels $y_{i}$. This indicates that the optimal separating hyperplane $H^{*}$ becomes a linear function of the data. Note also that formulating the problem, as above, with $b=0$ requires all hyperplanes to contain the origin. However, this is a mild restriction for high-dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifier using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others, the library contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. 
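\n\n# To connect this dual view to the library interface: after fitting, the products $\\alpha_{i}y_{i}$ of the non-zero Lagrange multipliers are exposed by the `dual_coef_` attribute of a fitted `SVC`, and the indices of the corresponding support vectors by `support_`. The following minimal sketch (an illustrative addition, assuming the training split `x_train`, `y_train` created above) fits a linear SVC and inspects these quantities:\n\n\n# illustrative sketch: fit a linear SVC and inspect the learned dual quantities\nsvc_dual_demo = SVC(kernel='linear', random_state=random_seed)\nsvc_dual_demo.fit(x_train, y_train)\n\n# signed dual coefficients alpha_i * y_i of the support vectors\nprint(svc_dual_demo.dual_coef_)\n\n# indices of the training samples acting as support vectors\nprint(svc_dual_demo.support_)\n\n# number of support vectors per class\nprint(svc_dual_demo.n_support_)\n\n\n# 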
Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. 
The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? 
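\n\n# One way to probe this numerically is to inspect the decision function of the fitted classifier, i.e. the signed distances of $x^{s2}$ to the learned separating hyperplanes. The following is a small illustrative sketch (assuming the fitted `svm` model and the feature vector `x_s2_feature_vector` from the cells above; the exact output layout depends on the `decision_function_shape` setting of `SVC`):\n\n\n# illustrative sketch: signed distances of the unknown observation to the separating hyperplanes\ndecision_values_s2 = svm.decision_function([x_s2_feature_vector])\n\n# print the decision function values\nprint(decision_values_s2)\n\n\n# 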
Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. 
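\n\n# In practice, a suitable value of $C$ is therefore often selected via cross-validation on the training data. The following minimal sketch (an illustrative addition using scikit-learn's `GridSearchCV` over a hypothetical grid of $C$ values, assuming the `x_train`, `y_train` split from above) shows one way to do this:\n\n\n# illustrative sketch: select the penalty term C via 5-fold cross-validation\nfrom sklearn.model_selection import GridSearchCV\n\n# hypothetical grid of candidate C values\nparam_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}\n\n# run the grid search on the training data only\ngrid_search = GridSearchCV(SVC(kernel='linear', random_state=random_seed), param_grid, cv=5)\ngrid_search.fit(x_train, y_train)\n\n# best value of C found by cross-validation\nprint(grid_search.best_params_)\n\n\n# 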
A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. 
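\n\n# Since the margin of the learned hyperplane equals $2/||w||$, it can also be recovered directly from the fitted model. The following small sketch (an illustrative addition, assuming the two-class linear model `svm` fitted on `x_train_test`, `y_train_test` above) computes the margin width and the number of support vectors per class:\n\n\n# illustrative sketch: recover the weight vector w of the fitted two-class linear SVM\nw = svm.coef_[0]\n\n# margin width 2 / ||w|| of the learned separating hyperplane\nmargin = 2.0 / np.linalg.norm(w)\nprint('Margin width: {}'.format(margin))\n\n# number of support vectors per class\nprint('Support vectors per class: {}'.format(svm.n_support_))\n\n\n# 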
Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. 
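\n\n# Nevertheless, the kernel value itself is cheap to evaluate. The following minimal sketch (an illustrative addition, writing the kernel as $\\exp(-\\gamma ||x_{i}-x_{j}||^{2})$ with $\\gamma = 1/(2\\sigma^{2})$ and a hypothetical value of $\\gamma$) computes it for two iris feature vectors of the training data:\n\n\n# illustrative sketch: evaluate the RBF kernel for two training samples\ngamma_demo = 0.5 # hypothetical kernel width, corresponds to sigma = 1\n\n# squared Euclidean distance between the first two training samples\nsquared_distance = np.sum((x_train[0] - x_train[1]) ** 2)\n\n# kernel value K(x_0, x_1) = exp(-gamma * ||x_0 - x_1||^2)\nkernel_value = np.exp(-gamma_demo * squared_distance)\nprint(kernel_value)\n\n\n# 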
However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. 
petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. 
Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. 
Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... 
+ 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', random_state=random_seed)\n\n\n# Train or fit the SVM classifier using the training dataset features and labels:\n\n#%%\n\n# train / fit the Support Vector Machine classifier\nsvm.fit(x_train, y_train)\n\n\n# #### 1.3.4. Evaluation of the trained Support Vector Machine Classifier\n\n# After fitting the training data, the optimal seperating hyperplane $H^{*}$ learned by the SVM model can then be used to predict the corresponding class labels $y_{i}'$ of so far unknown observations $x_{i}'$. We will utilize the trained model to predict the class labels of the remaining observations contained in the evaluation dataset:\n\n#%%\n\ny_pred = svm.predict(x_eval)\n\n\n# Let's have a look at the class labels $y_{i}'$ **predicted** by the SVM classifier on the evaluation dataset:\n\n#%%\n\ny_pred\n\n\n# As well as the **true** class labels $y_{i}$ as contained in the evaluation dataset:\n\n#%%\n\ny_eval\n\n\n# Ok, comparing the **true** and **predicted** class labels looks encouraging. Let's determine the exact **prediction accuracy** that the trained model $h$ was able to achieve on the evaluation dataset:\n\n#%%\n\nprint('Model classification accuracy: {}%'.format(\n str(metrics.accuracy_score(y_eval, y_pred) * 100)))\n\n\n# Determine the number of **misclassified** data sampels in the evaluation dataset:\n\n#%%\n\nprint('Number of mislabeled points out of a total {} points: {}'.format(\n x_eval.shape[0], np.sum(y_eval != y_pred)))\n\n\n# In the field of machine learning and in particular the field of statistical classification, a **confusion matrix**, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm. 
Each row of the matrix represents the number of instances that the classifier predicted per class, while each column represents the instances of the true or actual class:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/Confusion_matrix)\n\n# Determine and plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# determine the prediction confusion matrix\nmat = confusion_matrix(y_eval, y_pred)\n\n\n# Plot the **confusion matrix** of the individual predictions:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(5, 5))\n\n# plot confusion matrix heatmap\nsns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='YlOrRd_r',\n xticklabels=iris.target_names, yticklabels=iris.target_names)\n\n# add plot axis labels\nplt.xlabel('[true class label $y_{i}$]')\nplt.ylabel('[predicted class label $y_{i}\\'$]')\n\n# add plot title\nplt.title('SVM Predictions - Confusion Matrix')\n\n\n# #### 1.3.5. Prediction of Classes of Unknown Iris Flower Observations\n\n# **First unknown iris flower:** Now that we have trained and evaluated our SVM classifier let's apply it to two so far unknown or unseen **iris flower** observations. The first **iris flower** observation $x^{s1}$ exhibits the following observed feature values: $x^{s1} = \\{x_{sl}=5.8, x_{sw}=3.5, x_{pl}=1.5, x_{pw}=0.25\\}$:\n\n# \n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's convert those measurements into a feature vector $x^{s1}$:\n\n#%%\n\n# init features of the first unknown iris flower observation\nsepal_length = 5.8\nsepal_width = 3.5\npetal_length = 1.5\npetal_width = 0.25\n\n# create the observation feature vector\nx_s1_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s1_feature_vector)\n\n\n# Let's now use our trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s1}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_1 = svm.predict([x_s1_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_1[0]])\n\n\n# Let's build an intuition of the distinct iris flower class distributions including the current iris flower observation:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset('iris')\n\n# add preliminary label to unknown feature observation\nx_s1_feature_vector.append('observation s1')\n\n# add observation to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [x_s1_feature_vector], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **setosa**.\n\n# **Second unknown iris flower:** Let's apply the learned SVM model to a second unknown or unseen **iris flower** observations. 
The second **iris flower** observation $x^{s2}$ exhibits the following observed feature values $x^{s2} = \\{x_{1}=7.8, x_{2}=2.3, x_{3}=6.4, x_{4}=2.5\\}$:\n\n# \n#\n#\n# (Source: https://de.wikipedia.org/wiki/Schwertlilien)\n\n# Let's again convert those measurements into a feature vector $x^{s2}$:\n\n#%%\n\n# init features of the second unknown iris flower observation\nsepal_length = 7.8\nsepal_width = 2.3\npetal_length = 6.4\npetal_width = 2.5\n\n# create the observation feature vector\nx_s2_feature_vector = [sepal_length, sepal_width, petal_length, petal_width]\n\n# print the feature vector\nprint(x_s2_feature_vector)\n\n\n# Use the trained SVM model $h$ to predict the class $c^{*}$ of the unknown iris flower $x^{s2}$:\n\n#%%\n\n# determine class label prediction of the first unknown observation\nclass_prediction_sample_2 = svm.predict([x_s2_feature_vector])\n\n# convert predicted class label to class name\nprint(iris.target_names[class_prediction_sample_2[0]])\n\n\n# Ok, does this looks like a reasonable prediction? Let's again try to build an intuition of the prediction derived from the SVM model $h$ based on the distinct iris flower class distributions including $x^{s2}$:\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# add observations to the iris dataset\niris_plot = iris_plot.append(pd.DataFrame(\n [[7.8, 2.3, 6.4, 2.50, \"observation s2\"]], columns=iris_plot.columns))\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# Ok, the feature distributions of the feature values observable for the unknown iris flower $x^{s1}$ exhibit a high likelihood of beeing of class **virginica**.\n\n# #### 1.3.6. Linear Support Vector Machine (SVM) Classifers - The Non-Linear Seperable Case\n\n# Ok, great we have seen how to apply Support Vector classification to separable data. So how can we extend these ideas to handle non-separable data? To achieve this we would like to relax the initial constraints $ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $ and $ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $ when necessary. That is, we would like to introduce a further cost for doing so. This can be done by the introducing of so-called positive **\"slack variables\"** denoted $\\xi_{i}, i=1, ..., l$ in the Lagrange optimization $L_{P}$.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the non-separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Therefore, the initial constraints become:\n\n# $$ x_{i} \\cdot w + b \\geq + 1 - \\xi_{i}, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1 + \\xi_{i}, y_{i} = -1 $$\n#\n# $$ \\xi_{i} \\geq 0, \\forall i$$\n\n# Thus, for an error to occur, the corresponding $\\xi_{i}$ must exceed unity. As a result, $\\sum_{i=1}^{l} \\xi_{i}$ defines an upper bound on the number of training errors.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# A natural way to assign such an extra cost for errors is to add it to the primal Lagrangian objective function $L_{P}$ to be optimized. 
The Lagrangian therefore becomes:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} + C \\sum_{i=1}^{l} \\xi_{i} - \\sum_{i=1}^{l} \\alpha_{i}\\{y_{i}(x_{i} \\cdot w + b) -1 + \\xi_{i}\\} + \\sum_{i=1}^{l}\\alpha_{i} - \\sum_{i=1}^{l} \\mu_{i} \\xi_{i} $$\n\n# where $C$ is a parameter determines the penalty magnitude of errors. Furthermore, $\\mu_{i}$ are another set of Lagrange multipliers introduced to enforce positivity of the slack variables $\\xi_{i}$. We must now minimize $L_{P}$ with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the minimization of the second term $C \\sum_{i=1}^{l} \\xi_{i}$ minimizes the penalty of misclassfied training samples,\n# > 3. the maximization of the third term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 4. the minimization of the fourth term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors,\n# > 5. the maximization of the fifth term $\\sum_{i=1}^{l} \\mu_{i} \\xi_{i}$ enforces the positivity of the slack variables.\n\n# In general, the penalty term $C$ is a parameter to be chosen by the user. A larger $C$ corresponds to assigning a higher penalty to errors.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# We can again derive a dual formulation of the optimization objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the optimal hyperplane case is that the $\\alpha_{i}$ now have an upper bound of C. Again, the optimal seperating hyperplane $H^{*}$ still remains a linear function of the training data.\n\n# #### 1.3.7. Training of a Support Vector Machine (SVM) Classifier Using Different C Parameterizations\n\n# Let's inspect different parametrizations of $C$ and their corresponding impact on the determined support vectors and learned optimal separating hyperplane $H^{*}$. We can obtain the learned support vectors from the model using the `support_vectors_` method available `Scikit-Learn`. Let's again fit a linear SVM to the training data observations $x_{i}$ using a penalty of $C=1$:\n\n#%%\n\n# init the Support Vector Machine classifier\nsvm = SVC(kernel='linear', C=1, random_state=random_seed)\n\n\n# We will train the SVM model on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to seperate flowers of the classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_test = x_train[y_train != 0, :][:, [0, 2]]\ny_train_test = y_train[y_train != 0]\n\n\n# Let's fit the linear SVM model:\n\n#%%\n\nsvm.fit(x_train_test, y_train_test)\n\n\n# Let's briefly glance over the determined support vectors for which $\\alpha_{i} > 0$ and that constitute the learned max-margin separating hyperplane $H^{*}$:\n\n#%%\n\nsvm.support_vectors_\n\n\n# Finally, let's visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM. 
Remember, the learned hyperplane was optimized to seperate the features sepal length $x_1$ and petal length $x_3$ of the iris flower classes $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\n# init the plot\nfig = plt.figure(figsize=(6, 6))\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot sepal length vs. petal length and corresponding classes\nax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n# highlight the determined support vectors in green\nax.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n# determine axis ranges\nax = plt.gca()\nxlim = ax.get_xlim()\nylim = ax.get_ylim()\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 30)\nyy = np.linspace(ylim[0], ylim[1], 30)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# determine and plot decision boundary\nZ = svm.decision_function(xy).reshape(XX.shape)\nax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n# add axis legends\nax.set_xlabel(\"[sepal_length]\", fontsize=14)\nax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n# add plot title\nplt.title('Sepal Length vs. Petal Length - Decision Boundary', fontsize=14)\n\n\n# Ok, we can observe how the learned 24 support vectors nicely constitute the optimal maximum margin separating hyperplane $H^{*}$. Let's now investigate how different values of $C \\in \\{0.1, 10, 100, 1000\\}$ will penalize and therefore affect the number of support vectors. Remember, a larger value of $C$ corresponds to assigning a higher penalty to errors:\n\n#%%\n\n# init distinct C values\nC_values = [0.1, 1, 10, 100]\n\n# init SVM models of distinct C values\nsvm_models = (SVC(kernel='linear', C=C, random_state=random_seed)\n for C in C_values)\n\n\n# Let's fit the linear SVM models using distinct values of the penalty term $C$:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_test, y_train_test) for model in svm_models)\n\n\n# Let's now again visually inspect the maximum margin separating hyperplane $H^{*}$ that was learned by our SVM and applying different values of $C$:\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(2, 2, figsize=(14, 14))\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n # add grid\n ax.grid(linestyle='dotted')\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # determine and plot decision boundary\n Z = model.decision_function(xy).reshape(XX.shape)\n ax.contour(XX, YY, Z, colors='k',\n levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])\n\n # add axis legends\n ax.set_xlabel(\"[sepal_length]\", fontsize=14)\n ax.set_ylabel(\"[petal_length]\", fontsize=14)\n\n # add plot title\n ax.set_title('Decision Boundary, C={}, kernel=\\'{}\\''.format(\n str(model.C), str(model.kernel)), fontsize=14)\n\n\n# We can indeed observe that with increasing $C$ the number of misclassifications as well as the number of support vectors that constitute $H^{*}$ decreases.\n\n# #### 1.3.8. 
Non-Linear Support Vector Machine (SVM) Classifiers\n\n# How can the above linear SVMs be generalised to the case where the optimal separating hyperplane $H^{*}$ can not be formulated as a linear function of the data? This holds for instances when the training data is not linearly separable. Boser, Guyon and Vapnik [7] showed the so-called **\"kernel trick\"** (introduced by Aizermann[8]) could be used to accomplish this in a surprisingly straightforward way. First notice again, from the training objectives dual formulation, that the only way in which the data appears in the objective is in the form of dot products $$. Now suppose we first mapped the data to some other (possibly infinite-dimensional) Euclidean space $\\mathcal{H}$, using the mapping which we will call $\\phi$:\n\n# $$\\phi: \\mathcal{R}^{d} \\mapsto \\mathcal{H}$$\n\n# Then, of course, the training algorithm would only depend on the data through dot products in $\\mathcal{H}$, i.e. on functions of the form $\\phi(x_{i}) \\cdot \\phi(x_{j})$. Now if there were a **\"kernel function\"** $K$ such that $K(x_{i}, x_{j}) = \\phi(x_{i}) \\cdot \\phi(x_{j})$, we would only need to use $K$ in the training algorithm, and would never need to explicitly even know what $\\phi$ is. One such kernel function is:\n\n# $$K(x_{i}, x_{j}) = e^{-||x_{i}-x_{j}||^{2} / 2 \\sigma^{2}} $$\n\n# In this particular example, $\\mathcal{H}$ is infinite-dimensional, so it would not be very easy to work with $\\phi$ explicitly. However, if one replaces $x_{i} \\cdot x_{j}$ by $K(x_{i}, x_{j})$ everywhere in the training procedure, the algorithm will happily produce a SVM which lives in an infinite-dimensional space. All considerations of the previous sections still hold, since we are still doing a linear separation but in a different space. Since we can again derive a dual formulation of the optimisation objective using the conditions that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$, which becomes:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}K(x_{i}, x_{j})$$\n\n# subject to $0 \\leq \\alpha_{i} \\leq C$. The only difference in comparison to the linear hyperplane case is that the dot product $$ is now replaced by a kernel function $K(x_{i}, x_{j})$.\n\n# #### 1.3.9. Training of a Support Vector Machine (SVM) Classifier Using Different Kernel Functions\n\n# Let's now train a set of non-linear SVMs and evaluate different kernel functions $K(x_{i}, x_{j})$. We will again train the distinct SVM models on the sepal length $x_1$ and petal length $x_3$ features of the iris flower dataset to separate the distinct flower classes $c_{0}=$ setosa, $c_{1}=$ versicolor and $c_{2}=$ virginica:\n\n#%%\n\nx_train_kernel = x_train[:, [0, 2]]\ny_train_kernel = y_train\n\n\n# Next, we will instantiate several SVM models each equipped with a different kernel function. Thereby, we will use three of the kernel functions already available in the `Scikit-Learn` library:\n\n# > 1. linear kernel function: **$$**,\n# > 2. radial-basis kernel-function: $exp({- \\gamma ||x_{i}, x_{j}||^{2}})$, where $\\gamma$ is specified by the keyword `gamma` and must be greater than 0,\n# > 3. 
polynomial kernel-function: $(\\gamma + r)^{d}$, where $d$ is specified by the keyword `degree` and $r$ by `coef0`.\n\n# Let's instantiate the distinct SVM models accordingly:\n\n#%%\n\n# init the SVM models using distinct kernel functions\nsvm_models = (SVC(kernel='linear', C=1), SVC(kernel='rbf', gamma=0.1, C=1), SVC(kernel='rbf', gamma=0.2, C=1), SVC(kernel='rbf', gamma=0.5, C=1), SVC(kernel='rbf', gamma=0.7, C=1), SVC(\n kernel='poly', degree=1, coef0=1.0, C=1), SVC(kernel='poly', degree=2, coef0=1.0, C=1), SVC(kernel='poly', degree=5, coef0=1.0, C=1), SVC(kernel='poly', degree=7, coef0=1.0, C=1))\n\n\n# Let's subsequently train the distinct SVM models:\n\n#%%\n\n# fit the distinct SVM models to the data\nsvm_models = (model.fit(x_train_kernel, y_train_kernel)\n for model in svm_models)\n\n\n# Let's visually inspect the optimal separating hyperplane $H^{*}$ learned by the distinct kernel functions $K(x_{i}, x_{j})$ to separate the sepal length $x_1$ and petal length $x_3$ features :\n\n#%%\n\n# init the plot\nfig, sub = plt.subplots(3, 3, figsize=(14, 14))\n\n# determine mesh-grid limitations\nxlim = [np.min(x_train[:, 0]) - 0.8, np.max(x_train[:, 0]) + 0.8]\nylim = [np.min(x_train[:, 2]) - 0.8, np.max(x_train[:, 2]) + 0.8]\n\n# create meshgrid to evaluate model\nxx = np.linspace(xlim[0], xlim[1], 1000)\nyy = np.linspace(ylim[0], ylim[1], 1000)\nYY, XX = np.meshgrid(yy, xx)\nxy = np.vstack([XX.ravel(), YY.ravel()]).T\n\n# iterate over distinct models\nfor model, ax in zip(svm_models, sub.flatten()):\n\n print(model)\n\n # add grid\n ax.grid(linestyle='dotted')\n\n Z = model.predict(xy).reshape(XX.shape)\n ax.contourf(XX, YY, Z, alpha=0.5, cmap=plt.cm.coolwarm)\n\n # plot sepal length vs. petal length and corresponding classes\n ax.scatter(x_train[:, 0], x_train[:, 2], c=y_train, cmap=plt.cm.Set1)\n\n # highlight the determined support vectors in green\n ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[\n :, 1], s=200, linewidth=1, facecolor='none', edgecolors='k', label='support vectors')\n\n # set axis ranges\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n\n # add axis legends\n ax.set_xlabel('[sepal_length]', fontsize=10)\n ax.set_ylabel('[petal_length]', fontsize=10)\n\n # add plot title\n ax.set_title('C={}, kernel=\\'{}\\', degree=\\'{}\\', gamma=\\'{}\\''.format(str(\n model.C), str(model.kernel), str(model.degree), str(model.gamma)), fontsize=10)\n\n\n# ## 2. History of Oriented Gradients (HOG) Feature Extraction and Classification\n\n# ### 2.1. Dataset Download and Data Assessment\n\n# The **MNIST database** (**M**odified **N**ational **I**nstitute of **S**tandards and **T**echnology database) is a large database of handwritten digits that is commonly used for training various image processing systems. The database is widely used for training and testing in the field of machine learning. Let's have a brief look into a couple of sample images contained in the dataset:\n\n# \n#\n# (Source: https://en.wikipedia.org/wiki/MNIST_database)\n\n# Further details on the dataset can be obtained via: *LeCun, Y., 1998. \"The MNIST database of handwritten digits\", ( http://yann.lecun.com/exdb/mnist/ ).\"*\n\n# The MNIST database contains **60,000 training images** and **10,000 evaluation images**. The size of each image is 28 by 28 pixels. The handwritten digits contained in each fixe-sized image have been size-normalized and centred. 
The MNIST dataset is a great dataset to start with when learning about machine learning techniques and pattern recognition methods on real-world data. It requires minimal efforts on preprocessing and formatting the distinct images.\n\n# #### 2.1.1. Training Dataset Download and Data Assessment\n\n# Let's download, transform and inspect the training images of the dataset. Therefore, let's first define the directory in which we aim to store the training data:\n\n#%%\n\ntrain_path = './data/train_mnist'\n\n\n# Now, let's download the training data accordingly:\n\n#%%\n\n# download and transform training images\nmnist_train_data = torchvision.datasets.MNIST(\n root=train_path, train=True, download=True)\n\n\n# Convert the downloaded images to `Numpy` arrays:\n\n#%%\n\n# convert images and labels to numpy array\nmnist_train_data_images = mnist_train_data.data.numpy()\nmnist_train_data_labels = mnist_train_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of training images downloaded:\n\n#%%\n\n# determine the number of training data images\nmnist_train_data_images.shape\n\n\n# Verify the number and dimensionality of training labels downloaded:\n\n#%%\n\nmnist_train_data_labels.shape\n\n\n# Furthermore, let's visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_train_image = mnist_train_data_images[image_id, :, :]\nmnist_train_label = mnist_train_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_train_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_train_image, cmap='gray')\n\n\n# #### 2.1.2. Evaluation Dataset Download and Data Assessment\n\n# Let's now also download, transform and inspect the evaluation images of the dataset:\n\n#%%\n\n# set directory of evaluation images\neval_path = './data/eval_mnist'\n\n# download and transform evaluation images\nmnist_eval_data = torchvision.datasets.MNIST(\n root=eval_path, train=False, download=True)\n\n# convert images and labels to numpy array\nmnist_eval_data_images = mnist_eval_data.data.numpy()\nmnist_eval_data_labels = mnist_eval_data.targets.data.numpy()\n\n\n# Verify the number and dimensionality of evaluation images downloaded:\n\n#%%\n\n# determine the number of evaluation data images\nmnist_eval_data_images.shape\n\n\n# Verify the number and dimensionality of evaluation labels downloaded:\n\n#%%\n\nmnist_eval_data_labels.shape\n\n\n# Let's again visually inspect a randomly sampled training image:\n\n#%%\n\n# set image id\nimage_id = 1000\n\n# obtain image\nmnist_eval_image = mnist_eval_data_images[image_id, :, :]\nmnist_eval_label = mnist_eval_data_labels[image_id]\n\n# set image plot title\nplt.title('Example: {}, Label: {}'.format(\n str(image_id), str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(mnist_eval_image, cmap='gray')\n\n\n# ### 2.2. History of Oriented Gradients (HOG) Feature Extraction\n\n# The **\"Histogram of Oriented Gradients (HOG)\"** is a feature descriptor used in computer vision and image processing originally developed for the purpose of object detection. The technique counts occurrences of gradient orientation in localised portions of an image. 
Its usage became widespread in 2005 when Navneet Dalal and Bill Triggs, researchers for the French National Institute for Research in Computer Science and Automation (INRIA), presented their supplementary work on HOG descriptors at the Conference on Computer Vision and Pattern Recognition (CVPR) [9].\n\n# #### 2.2.1. Extraction of Image Patches\n\n# In the first step of the HOG feature extraction, the images are divided into tiny **\"patches\"**, each consisting of N\u00d7N pixels. In general, the patch size is a design choice informed by the scale of features we are looking for and task we aim to accomplish. To classify the 28x28 MNIST handwritten digit images presented above, we will use patches of size 7x7 pixels, which will nicely divide each image into 4x4=16 image patches. The extraction of such a single 7x7 image patch is shown below:\n\n# \n\n# #### 2.2.2. Calculation of Image Patch Gradients\n\n# Next, in order to determine the distinct values of the HOG features, we calculate the horizontal and vertical gradients of each image patch. This can be achieved by filtering each patch using the two kernels or **\"filter masks\"** as shown below. Thereby, we will obtain for each filter mask, a corresponding **\"gradient map\"** that records the intensity of pixel value change in the particular direction of the filter mask. As a result, the gradient maps remove a lot of non-discriminative information ( e.g., image regions that exhibit a constant colour intensity ), but highlighted regions of high color intensity changes.\n\n# \n\n# Let's have look at the image gradients obtainable for the horizonal filter-mask or kernel $k_{x}=[-1, 0, 1]$ in the x-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_x = np.array([[-1, 0, 1]])\n\n# determine the horizontal image gradients\ng_x = sp.signal.convolve2d(mnist_eval_image, kernel_x)\n\n# set image plot title\nplt.title('Gradients x-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_x, cmap='gray')\n\n\n# Let's have look at the image gradients obtainable for the vertical filter-mask or kernel $k_{y}=[-1, 0, 1]^{T}$ in the y-direction of the $1000^{th}$ sample image contained in the evaluation dataset. Thereby, dark pixel values correspond to high negative gradient value and light pixel values to high positive gradient values (prior to the determination of the gradients absolute value):\n\n#%%\n\n# define the filter masks\nkernel_y = np.array([[-1, 0, 1]]).T\n\n# determine the vertical image gradients\ng_y = sp.signal.convolve2d(mnist_eval_image, kernel_y)\n\n# set image plot title\nplt.title('Gradients y-Direction, Example: {}, Label: {}'.format(str(image_id),\n str(mnist_eval_label)))\n\n# plot mnist handwritten digit sample\nplt.imshow(g_y, cmap='gray')\n\n\n# #### 2.2.3. Calculation of Gradient Magnitude and Orientation\n\n# Once the gradients in (1) horizontal or x-direction and (2) vertical or y-direction is obtained for each pixel the information is consolidated to derive a more general information about the pixel intensity changes within an image. 
This is accomplished by the derivation of two important gradient attributes, namely:\n#\n# >- the **\"magnitude\"** of the gradients given be the gradients L2-norm: $\\sqrt{g_{x}^{2} + g_{y}^{2}}$,\n# >- the **\"orientation\"** of the gradients given by the gradients arctangent: $\\arctan (\\frac{g_{y}}{g_{y}})$.\n#\n# We will derive both attributes for each of the pixel values contained in the distinct image patches. This results in the gradient magnitude and gradient orientation map, as shown below:\n\n# \n\n# #### 2.2.4. Calculation of Histogram of Oriented Gradients (HOG)\n\n# As a last step, we will determine the HOG descriptors based on the gradient magnitude and the gradient orientation map. To achieve this, we will compute the histogram of the gradient orientations binned into $b_{n}, n=1,...,9$ bins. Thereby, the distinct bins correspond to equidistant intervalls of possible gradient orientations, e.g. $b_{1}=[0\u00b0, 19\u00b0], b_{2}=[20\u00b0, 39\u00b0], b_{3}=[40\u00b0, 59\u00b0], ..., b_{9}=[160\u00b0, 179\u00b0].$\n#\n# For each pixel of the image patch, the corresponding bin is selected based on its gradient orientation, and the vote ( the value that goes into the bin ) is selected based on the normalized gradient magnitude, according to:\n\n# $$b_{d} = \\frac{|b_{d} - d|}{b_{d}} \\times m = \\frac{|20 - 39|}{20} \\times 297 = 282.15$$\n#\n# $$b_{d+1} = \\frac{|b_{d+1} - d|}{b_{d}} \\times m = \\frac{|40 - 39|}{20} \\times 297 = 14.85$$\n\n# \n\n# Once all the values of the gradient maps have been collected to obtained histogram of gradients is normalized. This normalization is usually done by calculating the $L2-Norm$ over the distinct bin values, as shown in the following:\n#\n# $$||h||_{2} = \\sqrt{b_{1}^{2} + b_{2}^{2} + ... + b_{n}^{2}} = \\sqrt{420^2 + 1110^2 + ... + 787^2} = 2312.9$$\n#\n# and normalize the distinct bins accordingly to obtain the HOG feature vector of a particular image patch:\n#\n# $$ x_{i} = [\\frac{420}{2312.9}, \\frac{1110}{2312.9}, ..., \\frac{787}{2312.9}] = [0.18, 0.47, 0.28, ..., 0.34]$$\n#\n# where $i$ denotes the current of the N=16 image patches. 
Ultimately, all the HOG feature vectors obtained for the 16 distinct image patches are concatenated into a single HOG combined feature vector of an image.\n#\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all training images\n\n# init list of hog features\nmnist_train_data_hog_features = []\nmnist_train_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_train_image in enumerate(mnist_train_data_images):\n\n # extract hog features of current training image\n train_features, train_image = hog(mnist_train_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_train_data_hog_features.append(train_features)\n mnist_train_data_hog_images.append(train_image)\n\n # case: print image processing status\n if i % 10000 == 0:\n\n # print log message\n print('[LOG] {} features of training image {} succesfully extracted.'.format(\n str(len(train_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the training data:\n\n#%%\n\nlen(mnist_train_data_hog_features)\n\n\n# Inspect a single feature vector:\n\n#%%\n\nmnist_train_data_hog_features[1000]\n\n\n# Inspect the number of features extracted for each MNIST digit image:\n\n#%%\n\nlen(mnist_train_data_hog_features[1000])\n\n\n# Ok, we extracted HOG features for 4 orientations from each image consisting of 16 (4x4) patches of 7x7 pixels each. This results on total length of 64 extracted features per image (16 patches x 4 orientations).\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the training dataset:\n\n#%%\n\nplt.imshow(mnist_train_data_hog_images[1000], cmap='gray')\n\n\n# Let's calculate the HOG feature descriptors for the MNIST images of the training dataset:\n\n#%%\n\n# extract the hog features of all evaluation images\n\n# init list of hog features\nmnist_eval_data_hog_features = []\nmnist_eval_data_hog_images = []\n\n# iterate over all training images\nfor i, mnist_eval_image in enumerate(mnist_eval_data_images):\n\n # extract hog features of current evluation image\n eval_features, eval_image = hog(mnist_eval_image, orientations=4, pixels_per_cell=(\n 7, 7), cells_per_block=(1, 1), visualize=True)\n\n # collect extracted hog features\n mnist_eval_data_hog_features.append(eval_features)\n mnist_eval_data_hog_images.append(eval_image)\n\n # case: print image processing status\n if i % 1000 == 0:\n\n # print log message\n print('[LOG] {} features of evaluation image {} succesfully extracted.'.format(\n str(len(eval_features)), str(i).zfill(5)))\n\n\n# Inspect the completeness of the generated feature vectors derived from the evaluation data:\n\n#%%\n\nlen(mnist_eval_data_hog_features)\n\n\n# Let's also visualise the HOG features of an exemplary MNIST digit image of the evaluation dataset:\n\n#%%\n\nplt.imshow(mnist_eval_data_hog_images[1000], cmap='gray')\n\n\n# ### 2.3. History of Oriented Gradients (HOG) Feature Classification\n\n# #### 2.3.1. 
Training of the Support Vector Machine Classifier\n\n# Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane based on the extracted History of Oriented Gradients (HOG) features:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "svm = SVC(kernel='linear', C=1, random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "svc_hog_lin = svm.SVC(kernel='linear', C=1)\n# fit the classifier to the training data\nsvc_hog_lin.fit(mnist_train_data_hog_features, mnist_train_labels)\n", "model": "natural", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. 
This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. 
(Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n\nimport math\nx = 1 + 1\nx\n\n\ny = 1.0 * 2\ny\n\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n\n# Classic division\n3 / 5\n\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n\n5 % 2 # Remainder of number 1 / number 2\n\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n\nx = 1\ny = 1.0\nz = 2\n\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. 
This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. 
(Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n#%%\n\nimport math\nx = 1 + 1\nx\n\n#%%\n\ny = 1.0 * 2\ny\n\n#%%\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n#%%\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n#%%\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n#%%\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n#%%\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n#%%\n\n# Classic division\n3 / 5\n\n#%%\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n#%%\n\n5 % 2 # Remainder of number 1 / number 2\n\n#%%\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n#%%\n\nx = 1\ny = 1.0\nz = 2\n\n#%%\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n#%%\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n#%%\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)", "original_comment": "# Check if 0 is in x\n", "target_code": "print(0 in x)\n", "project_metadata": {"full_name": "jlaura/GIS321", "description": null, "topics": [], "git_url": "git://github.com/jlaura/GIS321.git", "stars": 5, "watchers": 5, "forks": 15, "created": "2016-01-11T03:36:14Z", "size": 3772, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 250997}, "last_updated": "2017-04-06T06:32:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "x = [1, 2, 3, 4, 5]\nprint(1 in x)\n", "model": "docstring", "intent": "# Check if 0 is in x"}, {"context": "#!/usr/bin/env python\n# coding: 
utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. 
This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n\nimport math\nx = 1 + 1\nx\n\n\ny = 1.0 * 2\ny\n\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n\n# Classic division\n3 / 5\n\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n\n5 % 2 # Remainder of number 1 / number 2\n\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n\nx = 1\ny = 1.0\nz = 2\n\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. 
This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. 
Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n#%%\n\nimport math\nx = 1 + 1\nx\n\n#%%\n\ny = 1.0 * 2\ny\n\n#%%\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n#%%\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n#%%\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n#%%\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n#%%\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n#%%\n\n# Classic division\n3 / 5\n\n#%%\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n#%%\n\n5 % 2 # Remainder of number 1 / number 2\n\n#%%\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n#%%\n\nx = 1\ny = 1.0\nz = 2\n\n#%%\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n#%%\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n#%%\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)", "original_comment": "# Check if 0 is in x\n", "target_code": "print(0 in x)\n", "project_metadata": {"full_name": "jlaura/GIS321", "description": null, "topics": [], "git_url": "git://github.com/jlaura/GIS321.git", "stars": 5, "watchers": 5, "forks": 15, "created": "2016-01-11T03:36:14Z", 
"size": 3772, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 250997}, "last_updated": "2017-04-06T06:32:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "x = [1, 2, 3, 4, 5]\nprint(0 in x)\n", "model": "natural", "intent": "# Check if 0 is in x"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n\nresults_test, y_true, y_pred, scores, 
class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n#%%\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n#%%\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n#%%\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n#%%\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n#%%\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n#%%\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n#%%\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n#%%\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n#%%\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n#%%\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll 
find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n#%%\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n#%%", "original_comment": "# Reload with new savename\n", "target_code": "learn = create_learner(data, 'stage2-bestmodel')\nlearn.load('stage1-bestmodel')\n", "project_metadata": {"full_name": "URBNOpenSource/custom-vision-study", "description": null, "topics": [], "git_url": "git://github.com/URBNOpenSource/custom-vision-study.git", "stars": 5, "watchers": 5, "forks": 4, "created": "2019-03-12T20:31:02Z", "size": 19785, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5009642, "Python": 5509, "Shell": 928}, "last_updated": "2019-10-24T13:27:26Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "learn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n", "model": "no-comments", "intent": "# Reload with new savename"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
# - header: include the first row as the header for the DataFrame object\n# - index_col: set the index column of the DataFrame to the first column of the data set ('month')\n# - parse_dates: automatically parse dates which will index the DataFrame\n# - sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon\n#
\n\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n\n# print the type of an object\ntype(df)\n\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n\n# list the index of the DataFrame\ndf.index\n\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n\n# print first 5 rows\ny.head()\n\n\n# print type of the ts object\ntype(y)\n\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # 1. Data\n#\n# Every problem starts with data....\n#\n# Obviously there are a multitude of data sets out there. Thus, this tutorial will make use of two data sets: the stylized \"International Airline Passengers\" data set as well as a self generated data set of stock prices download from Yahoo! 
Finance.\n\n#%%\n\n# %load_ext autoreload\n# %autoreload 2\nimport math\nfrom IPython.display import set_matplotlib_formats, Image\nfrom ipywidgets import interactive, widgets, RadioButtons, ToggleButtons, Select, FloatSlider, FloatProgress\nimport seaborn as sns\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport warnings\nimport numpy as np\nimport pandas_datareader.data as web\nimport pandas as pd\nimport itertools\nimport datetime\nimport sys\nimport os\nimport re\nget_ipython().run_line_magic('matplotlib', 'inline')\nget_ipython().run_line_magic('config', \"InlineBackend.figure_format = 'retina'\")\n\n\n# basic functionalities\n\n\n# data transforamtion and manipulation\n# prevent crazy long pandas prints\npd.options.display.max_columns = 16\npd.options.display.max_rows = 16\npd.set_option('display.float_format', lambda x: '%.5f' % x)\nnp.set_printoptions(precision=5, suppress=True)\n\n\n# remove warnings\nwarnings.filterwarnings('ignore')\n\n\n# plotting and plot stying\nplt.style.use('seaborn')\n#sns.set_style(\"whitegrid\", {'axes.grid' : False})\n#set_matplotlib_formats('pdf', 'png')\nplt.rcParams['savefig.dpi'] = 80\nplt.rcParams['figure.autolayout'] = False\nplt.rcParams['figure.figsize'] = (16, 8)\nplt.rcParams['axes.labelsize'] = 16\nplt.rcParams['axes.labelweight'] = 'bold'\nplt.rcParams['axes.titlesize'] = 20\nplt.rcParams['axes.titleweight'] = 'bold'\nplt.rcParams['font.size'] = 16\nplt.rcParams['lines.linewidth'] = 2.0\nplt.rcParams['lines.markersize'] = 8\nplt.rcParams['legend.fontsize'] = 14\nplt.rcParams['text.usetex'] = False\n#plt.rcParams['font.family'] = \"serif\"\nplt.rcParams['font.serif'] = \"cm\"\nplt.rcParams['text.latex.preamble'] = b\"\\usepackage{subdepth}, \\usepackage{type1cm}\"\n\n\n# jupyter wdgets\n\n\n# ## 1.1 Working with Data (Structures)\n#\n# The pandas module allows to introduce data management to data. Using the read_csv method the data is strung into a DataFrame object which allows to directly access the data and providing the means for data analysis and transformation. A dataframe is a collection of Series objects.\n#\n# A much better formatting of the data can be established if additional arguments are specified when importing the data set, for example:\n#
    \n#
# - header: include the first row as the header for the DataFrame object\n# - index_col: set the index column of the DataFrame to the first column of the data set ('month')\n# - parse_dates: automatically parse dates which will index the DataFrame\n# - sep: specify the symbol which separates the values and strings in the data set - in this case it is a semi-colon\n#
\n\n#%%\n\n# load passenger data set and safe to DataFrame\ndf = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\n\n#%%\n\n# print the first 5 rows of the DataFrame\ndf.tail()\n\n#%%\n\n# print the type of an object\ntype(df)\n\n#%%\n\n# print the data type of all columns except for the index column\ndf.dtypes\n\n#%%\n\n# print summary statistics\ndf.describe()\n\n\n# Indexes are important in the domain of pandas data transformations and the applicability of prebuilt analytics methods from other libraries.\n\n#%%\n\n# list the index of the DataFrame\ndf.index\n\n#%%\n\n# save the n_passenger column to a new variable, this becomes a Series object\ny = df[\"n_passengers\"]\n\n# or\ny = df.n_passengers\n\n#%%\n\n# print first 5 rows\ny.head()\n\n#%%\n\n# print type of the ts object\ntype(y)\n\n#%%\n\n# select all entries from the year '1950'\ny['1952-01']\n\n\n# ## 1.2 Download Historical Data\n#\n# This is a sample workflow of how to retrieve data directly from the web using the pandas_datareader module to access stock price data from the Yahoo! Finance or Google Finance APIs. Yahoo! Finance is up to now the most popular service, and thus it will also be used here.\n\n#%%\n\n# define arguments\ntickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'NFLX', '^GSPC']\nprovider = 'yahoo'\nstart = datetime.datetime(2012, 5, 18)\nend = datetime.date(2017, 8, 22)\n\n#%%\n\n# ask Yahoo! Finance for data\npanel = web.DataReader(tickers, provider, start, end)\n\n#%%\n\nprint(panel['Adj Close'])\n\n\n# Without further specification, this command returns a Panel object. A panel can be thought of as a collection of DateFrame object.\n#\n# Additional data is also provided by this service, such as corporate actions like stock split or dividend payments\n\n#%%", "original_comment": "# ask Yahoo! Finance for data\n", "target_code": "corporate_actions = web.DataReader(tickers, 'yahoo-actions', start, end)\n", "project_metadata": {"full_name": "dacatay/time-series-analysis", "description": "Presentation for time series analysis", "topics": [], "git_url": "git://github.com/dacatay/time-series-analysis.git", "stars": 41, "watchers": 41, "forks": 53, "created": "2017-09-08T13:45:56Z", "size": 43990, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12370243, "R": 4829}, "last_updated": "2020-11-05T10:34:15Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "df = pd.read_csv('./data/passengers.csv', header=0,\n index_col=0, parse_dates=True, sep=';')\ndf.tail()\n", "model": "no-comments", "intent": "# ask Yahoo! 
Finance for data"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. 
The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 
76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. Let's also save the dataset locally to `GDrive`:\n\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n\n# number of anomalies vs. 
regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 13 - \"Autoencoder Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Autoencoder Neural Networks (AENN)**.\n#\n# Unlike standard feedforward neural networks, AENN's learn how to **encode** the input data into a low dimensional representation. At the same time, the AENN learns how to **decode** the original data back from the encoded representation. The decoded data usually referred to as \"reconstruction\", should match the original input as closely as possible.\n#\n# We will again use the functionality of the `PyTorch` library to implement and train an autoencoder neural network. The network will be trained to learn the characteristics of historical **accounting data**, usually referred to as \"journal entries.\" Once the model is trained, we will apply it to detect anomalous journal entries contained in the dataset. 
Finally, we will inspect the low-dimensional representations of each journal entry to interpret the detection results.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# >1. Understand the **basic concepts, intuitions and major building blocks** of autoencoder neural networks.\n# >2. **Pre-process** categorical financial data to learn a model of its characteristics and pattern.\n# >3. Apply autoencoder neural networks to **detect anomalies** in large-scale financial data.\n# >4. **Interpret the detection results** of the network as well as its reconstruction loss.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab, post them in our NextThought lab discussion forum (https://financial-data-science.nextthought.io), or send us an email (using our fds.ai email addresses).\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport io\nimport urllib\nimport itertools\nimport sys\nimport os\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom importlib import reload\nfrom google.colab import drive\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nfrom IPython.display import YouTubeVideo\n# GitHub Arctic Code Vault\n# YouTubeVideo('fzI9FNjXQ0o', width=800, height=400)\n\n\n# ### Motivation\n\n# The Association of Certified Fraud Examiners estimates in its Global Fraud Study 2016 [1] that the typical organization loses 5% of its annual revenues due to fraud. According to Joseph T. Wells [2] the term **\"fraud\"** refers to, **\"the abuse of one's occupation for personal enrichment through the deliberate misuse of an organization's resources or assets\"**.\n#\n# A similar more recent study, conducted by the auditors of PwC, revealed that 30% of the study respondents experienced losses of between USD 100,000 and USD 5 million [3] in the last 24 months. The study also showed that financial statement fraud caused by far the greatest median loss of the surveyed fraud schemes.\n\n# ### Classification of Financial Anomalies\n\n# When conducting a detailed examination of real-world journal entries, usually recorded in large-scaled Accounting Information Systems (AIS) or Enterprise Ressource Planning (ERP) systems, two common characteristics can be observed:\n#\n# > - specific transactions attributes exhibit **a wide variety of distinct attribute values**, e.g., customer information, posted sub-ledgers, amount data, and\n# > - the transactions exhibit **strong dependencies between specific attribute values** e.g., between customer information and type of payment, posting type, and general ledgers.\n#\n# Derived from this observation we distinguish two classes of anomalous journal entries, namely **\"global\"** and **\"local\" anomalies** as illustrated in **Figure 1** below:\n\n# \n\n# **Figure 1:** Illustrative example of global and local anomalies portrait in a feature space of the two transaction features \"Posting Amount\" (Feature 1) and \"Posting Positions\" (Feature 2).\n\n# ***Global Anomalies***, are financial transactions that exhibit **unusual or rare individual attribute values**. 
These anomalies usually relate to highly skewed attributes, e.g., seldom posting users, rarely used ledgers, or unusual posting times. Traditionally \"red-flag\" tests performed by auditors during annual audits are designed to capture those types of anomalies. However, such tests might result in a high volume of false-positive alerts due to, e.g., regular reverse postings, provisions, and year-end adjustments usually associated with a low fraud risk.\n\n# ***Local Anomalies***, are financial transactions that exhibit an **unusual or rare combination of attribute values** while the individual attribute values occur quite frequently e.g. exceptional accounting records. This type of anomaly is significantly more challenging to detect since perpetrators intend to disguise their activities trying to imitate a normal behavior. As a result, such anomalies usually pose a high fraud risk since they might correspond to, e.g., misused user accounts, irregular combinations of general ledger accounts and posting keys that don't follow a usual activity pattern.\n\n# ### Setup of the Jupyter Notebook Environment\n\n# As a next step, let's import the libraries needed throughout the lab:\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT`, and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Import Google's `GDrive` connector and mount your `GDrive` directories:\n\n#%%\n\n# import the Google Colab GDrive connector\n\n# mount GDrive inside the Colab notebook\ndrive.mount('/content/drive')\n\n\n# Create a structure of Colab Notebook sub-directories inside of `GDrive` to store (1) the data as well as (2) the trained neural network models:\n\n#%%\n\n# create Colab Notebooks directory\nnotebook_directory = '/content/drive/MyDrive/Colab Notebooks'\nif not os.path.exists(notebook_directory):\n os.makedirs(notebook_directory)\n\n# create data sub-directory inside the Colab Notebooks directory\ndata_directory = '/content/drive/MyDrive/Colab Notebooks/data'\nif not os.path.exists(data_directory):\n os.makedirs(data_directory)\n\n# create models sub-directory inside the Colab Notebooks directory\nmodels_directory = '/content/drive/MyDrive/Colab Notebooks/models'\nif not os.path.exists(models_directory):\n os.makedirs(models_directory)\n\n\n# Set a random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' 
computation enabled'.format(\n str(now), str(device)))\n\n\n# Also, let's display information about the potential GPUs running on the server:\n\n#%%\n\nget_ipython().system('nvidia-smi')\n\n\n# Let's execute the cell below to display information about the `Python` and `PyTorch` version running on this notebook or compute server:\n\n#%%\n\n# print current Python version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The Python version: {}'.format(now, sys.version))\n\n#%%\n\n# print current PyTorch version\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] The PyTorch version: {}'.format(now, torch.__version__))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# Nowadays, organizations accelerate the digitization and reconfiguration of business processes [4] affecting in particular Accounting Information Systems (AIS) or more general Enterprise Resource Planning (ERP) systems.\n#\n# Steadily, these systems collect vast quantities of electronic evidence at an almost atomic level. This observation holds in particular for the journal entries of an organization recorded in its general ledger and sub-ledger accounts. SAP, one of the most prominent ERP software providers, estimates that approx. 76% of the world's transaction revenue touches one of their systems [5].\n#\n# The illustration in **Figure 1** depicts a hierarchical view of an Accounting Information System (AIS) recording process and journal entry information in designated database tables. In the context of fraud examinations, the data collected by such systems may contain valuable traces of a potential fraud scheme.\n\n# \n\n# **Figure 1:** Hierarchical view of an Accounting Information System (AIS) that records distinct layers of abstraction, namely (1) the business process information, (2) the accounting information as well as the (3) technical journal entry information in designated database tables.\n\n# In this section of the lab notebook, we will conduct a descriptive analysis of the lab's financial dataset. Furthermore, we will apply some necessary pre-processing steps to train a deep neural network. The lab is based on a derivation of the **\"Synthetic Financial Dataset For Fraud Detection\"** by Lopez-Rojas [6] available via the Kaggle predictive modeling and analytics competitions platform that can be obtained using the following link: https://www.kaggle.com/ntnu-testimon/paysim1.\n#\n# Let's start loading the dataset and investigate its structure and attributes:\n\n#%%\n\n# load the dataset into the notebook kernel\nurl = 'https://raw.githubusercontent.com/financial-data-science/CFDS-Notebooks/master/lab_13/data/fraud_dataset_v2.csv'\nori_dataset = pd.read_csv(url)\n\n\n# Let's also check the dataset dimensionalities for completeness:\n\n#%%\n\n# inspect the datasets dimensionalities\nnow = dt.datetime.utcnow().strftime(\"%Y.%m.%d-%H:%M:%S\")\nprint('[LOG {}] transactional dataset of {} rows and {} columns retreived.'.format(\n now, ori_dataset.shape[0], ori_dataset.shape[1]))\n\n\n# Ok, looks good. 
Let's also save the dataset locally to `GDrive`:\n\n#%%\n\nori_dataset.to_excel(os.path.join(data_directory, \"fraud_dataset.xlsx\"))\n\n\n# #### 1.1 Initial Data and Attribute Assessment\n\n# We augmented the dataset and renamed the attributes to mimic a real-world dataset that one usually observes in SAP-ERP systems as part of SAP's Finance and Cost controlling (FICO) module.\n#\n# The dataset contains a subset of in total seven categorical and two numerical attributes available in the FICO BKPF (containing the posted journal entry headers) and BSEG (containing the posted journal entry segments) tables. Please, find below a list of the individual attributes as well as a brief description of their respective semantics:\n#\n# >- `BELNR`: the accounting document number,\n# >- `BUKRS`: the company code,\n# >- `BSCHL`: the posting key,\n# >- `HKONT`: the posted general ledger account,\n# >- `PRCTR`: the posted profit center,\n# >- `WAERS`: the currency key,\n# >- `KTOSL`: the general ledger account key,\n# >- `DMBTR`: the amount in the local currency,\n# >- `WRBTR`: the amount in the document currency.\n#\n# Let's also have a closer look into the top 10 rows of the dataset:\n\n#%%\n\n# inspect top rows of dataset\nori_dataset.head(10)\n\n\n# You may also have noticed the attribute `label` in the data. We will use this field throughout the lab to evaluate the quality of our trained models. The field describes the true nature of each transaction of either being a **regular** transaction (denoted by `regular`) or an **anomaly** (denoted by `global` and `local`). Let's have a closer look into the distribution of the regular vs. anomalous transactions in the dataset:\n\n#%%\n\n# number of anomalies vs. regular transactions\nori_dataset.label.value_counts()\n\n\n# Ok, the statistic reveals that similar to real-world scenarios, we are facing a highly \"unbalanced\" dataset. Overall, the dataset contains only a small fraction of **100 (0.018%)** anomalous transactions. While the 100 anomalous entries encompass **70 (0.013%)** \"global\" anomalies and **30 (0.005%)** \"local\" anomalies as introduced in section 1.2.\n\n#%%\n\n# remove the \"ground-truth\" label information for the following steps of the lab\nlabel = ori_dataset.pop('label')\n\n\n# #### 1.2 Pre-Processing of Categorical Transaction Attributes\n\n# From the initial data assessment above, we can observe that the majority of attributes recorded in AIS- and ERP-systems correspond to categorical (discrete) attribute values, e.g. the posting date, the general ledger account, the posting type, the currency. 
Let's have a more detailed look into the distribution of two dataset attributes, namely (1) the posting key `BSCHL` as well as (2) the general ledger account `HKONT`:\n\n#%%\n\n# prepare to plot posting key and general ledger account side by side\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot the distribution of the posting key attribute\ng = sns.countplot(x=ori_dataset['BSCHL'], ax=ax[0])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('BSCHL Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Posting Key\\' attribute values', fontsize=20)\n\n# plot the distribution of the general ledger attribute\ng = sns.countplot(x=ori_dataset['HKONT'], ax=ax[1])\n\n# set axis labels\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\ng.set_xlabel('HKONT Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'General Ledger\\' attribute values', fontsize=20)\n\n\n# Unfortunately, neural networks are, in general, not designed to be trained directly on categorical data and require the attributes to be trained on to be numeric. One simple way to meet this requirement is by applying a technique referred to as **\"one-hot\" encoding**. Using this encoding technique, we will derive a numerical representation of each of the categorical attribute values. One-hot encoding creates new binary columns for each categorical attribute value present in the original data.\n#\n# Let's have a look at the example shown in **Figure 2** below. The **categorical attribute \u201cReceiver\u201d** below contains the names \"John,\" \"Timur\" and \"Marco.\" We \"one-hot\" encode the names by creating a separate binary column for each possible name-value observable in the \"Receiver\" column. Now, we encode for each transaction that contains the value \"John\" in the \"Receiver\" column this observation with 1.0 in the newly created \"John\" column and 0.0 in all other generated name columns.\n\n# \n#\n# **Figure 2:** Exemplary one-hot encoding of the distinct `Receiver` attribute values into specific binary (\"one-hot) columns. Thereby, each attribute value observable in the dataset results in a separate column. The column value `1.0` denotes the occurance of the attribute value in the corresponding journal entry. In contrast the column value `0.0` indicates the absence of the attribute value in the corresponding journal entry.\n\n# Using this technique will \"one-hot\" encode the six categorical attributes in the original transactional dataset. 
This can be achieved using the `get_dummies()` function available in the Pandas data science library:\n\n#%%\n\n# select categorical attributes to be \"one-hot\" encoded\ncategorical_attr_names = ['KTOSL', 'PRCTR', 'BSCHL', 'HKONT']\n\n# encode categorical attributes into a binary one-hot encoded representation\nori_dataset_cat_processed = pd.get_dummies(ori_dataset[categorical_attr_names])\n\n\n# Finally, let's inspect the encoding of 10 sample transactions to see if the encoding was accomplished successfully;\n\n#%%\n\n# inspect encoded sample transactions\nori_dataset_cat_processed.head(10)\n\n\n# #### 1.3 Pre-Processing of Numerical Transaction Attributes\n\n# Let's now inspect the distributions of the two numerical attributes contained in the transactional dataset namely, the (1) local currency amount `DMBTR` and the (2) document currency amount `WRBTR`:\n\n#%%\n\n# plot the log-scaled \"DMBTR\" as well as the \"WRBTR\" attribute value distribution\nfig, ax = plt.subplots(1, 2)\nfig.set_figwidth(20)\n\n# plot distribution of the local amount attribute\ng = sns.distplot(ori_dataset['DMBTR'].tolist(), ax=ax[0])\n\n# set axis labels\ng.set_xlabel('DMBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title('Distribution of the \\'Local Amount\\' attribute values', fontsize=20)\n\n# plot distribution of the document amount attribute\ng = sns.distplot(ori_dataset['WRBTR'].tolist(), ax=ax[1])\n\n# set axis labels\ng.set_xlabel('WRBTR Value', fontsize=18)\ng.set_ylabel('Value Count', fontsize=18)\n\n# set plot title\ng.set_title(\n 'Distribution of the \\'Foreign Amount\\' attribute values', fontsize=20)\n\n\n# As expected, it can be observed that for both attributes, the distributions of amount values are **heavy-tailed**. In order to approach faster a potential global minimum scaling and normalization of numerical input values is good practice. Therefore, we first log-scale both variables and second min-max normalize the scaled amounts to the interval [0, 1].\n\n#%%\n\n# select the 'DMBTR' and 'WRBTR' attribute\nnumeric_attr_names = ['DMBTR', 'WRBTR']\n\n# add a small epsilon to eliminate zero values from data for log scaling\nnumeric_attr = ori_dataset[numeric_attr_names] + 1e-7\n\n# log scale the 'DMBTR' and 'WRBTR' attribute values\nnumeric_attr = numeric_attr.apply(np.log)", "original_comment": "# normalize all numeric attributes to the range [0,1]\n", "target_code": "ori_dataset_num_processed = (\n numeric_attr - numeric_attr.min()) / (numeric_attr.max() - numeric_attr.min())\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "numeric_attr = (numeric_attr - numeric_attr.min()) / \\\n (numeric_attr.max() - numeric_attr.min())\n", "model": "natural", "intent": "# normalize all numeric attributes to the range [0,1]"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n#%%\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing", "original_comment": "# Let's first look at the data frame.\n", "target_code": "rating.head()\n", "project_metadata": {"full_name": "liyenhsu/restaurant-data-with-consumer-ratings", "description": "Build recommender systems for restaurants", "topics": [], "git_url": "git://github.com/liyenhsu/restaurant-data-with-consumer-ratings.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2017-11-09T05:11:58Z", "size": 1373, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1230183}, "last_updated": "2020-10-11T20:40:42Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "reader = Reader(rating_scale=(1, 5))\ndata = Dataset.load_from_df(\n rating[['userId','movieId', 'rating']], reader)\n", "model": "no-comments", "intent": "# look at the data frame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\n\nSTART = 20000\nEND = 
40000\n\n\n# ## Load Data\n\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\nfrom extract_feature import FeatureExtractor\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n#%%\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()", "original_comment": "# ## Extract Feature\n", "target_code": "from extract_feature import FeatureExtractor\n\nFE = FeatureExtractor(model_name='xlm-r')\n", "project_metadata": {"full_name": "ilhamfp/indonesian-text-classification-multilingual", "description": "Improving Indonesian text classification using multilingual language model", "topics": ["multilingual-language-model", "text-classification", "indonesian-language", "indonesian-text-classification", "sentiment-analysis", "hate-speech-detection", "language-model", "multilingual", "zero-shot", "monolingual", "cross-lingual-transfer", "multilingual-language-models", "indonesian-data", "english-language"], "git_url": "git://github.com/ilhamfp/indonesian-text-classification-multilingual.git", "stars": 7, "watchers": 7, "forks": 0, "created": "2020-04-26T07:27:39Z", "size": 15604, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3476215, "Python": 28982}, "last_updated": "2020-12-20T17:12:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "train_pos = train[train['label'] == 1].reset_index(drop=True)\ntrain_neg = train[train['label'] == 0].reset_index(drop=True)\nprint(train_pos.shape)\ntrain_pos.head()\n", "model": "natural", "intent": "# Extract Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport 
os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n\nimage = 
cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', 
data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n 
'/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in 
hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n\nEXRFiles = glob.glob(\n 
\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n\nprint(min(loss_list_mean.items(), key=lambda x: 
x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n#%%\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = 
subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n#%%\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n#%%\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n#%%\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n#%%\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n#%%\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n#%%\n\ndef keyFunc(afilename):\n 
nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = 
glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, 
cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n#%%\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n#%%\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print 
(\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n#%%\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n#%%\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n#%%\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n#%%\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n#%%\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n#%%\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n 
res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n#%%\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n#%%\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n#%%\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n#%%\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n#%%\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = 
list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n#%%\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n#%%\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n#%%\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')", "original_comment": "# Tweak spacing to prevent clipping of ylabel\n", "target_code": "plt.subplots_adjust(left=0.15)\n", "project_metadata": {"full_name": "kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN", "description": "Predict HDR images from LDR images using CNN", "topics": [], "git_url": "git://github.com/kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2017-07-10T10:31:45Z", "size": 16499, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 96258, "Python": 44059, "MATLAB": 26466, "Shell": 15315, "M": 423}, "last_updated": "2020-07-07T08:49:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly 
disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "newArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt', 'r')\nhdr_files = fo.readlines()\nhdrARR = []\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmb\n", "model": "no-comments", "intent": "# Tweak spacing to prevent clipping of ylabel"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n# Dec 10 - Dec 20, 2019\n# Lab on Logistic Regression
\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n\ndf.columns\n\n\ndf.describe()\n\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n\ndf.head()\n\n\ndf.isnull().any()\n\n\n\nvariables = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',\n 'Ticket', 'Fare', 'Embarked', 'Survived', 'Initial']\n# Calculate the correlations\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n# Dec 10 - Dec 20, 2019\n# Lab on Logistic Regression
\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n#%%\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n#%%\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n#%%\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.describe()\n\n#%%\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.isnull().any()", "original_comment": "# # Correlation between variables\n", "target_code": "corr_mat = df[variables].corr().round(2)\n", "project_metadata": {"full_name": "ghimireadarsh/AI-WinterSchool", "description": "Comprises of various lecture slides, papers, practical notebooks used during AI Winter school, organized by NAAMII at Pokhara, Nepal from December 10, 2019 to December 20, 2019. ", "topics": [], "git_url": "git://github.com/ghimireadarsh/AI-WinterSchool.git", "stars": 6, "watchers": 6, "forks": 6, "created": "2019-12-14T18:16:09Z", "size": 75918, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1041087, "HTML": 666537, "Python": 20395}, "last_updated": "2020-09-27T21:32:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "coverage": "Strongly disagree", "coverage-score": 0, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df = df[variables]\ndf.head()\n", "model": "no-comments", "intent": "# Correlation between variables"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. [Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. 
The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, model.intercept))\n\n\ntrainingSummary = model.summary\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Problem\n# Congratulations! You've been contracted by Hyundai Heavy Industries to help them build a predictive model for some ships. 
[Hyundai Heavy Industries](http://www.hyundai.eu/en) is one of the world's largest ship manufacturing companies and builds cruise liners.\n#\n# You've been flown to their headquarters in Ulsan, South Korea to help them give accurate estimates of how many crew members a ship will require.\n#\n# They are currently building new ships for some customers and want you to create a model and use it to predict how many crew members the ships will need.\n#\n# Here is what the data looks like so far:\n#\n# Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n# ships.\n#\n#\n# Variables/Columns\n# Ship Name 1-20\n# Cruise Line 21-40\n# Age (as of 2013) 46-48\n# Tonnage (1000s of tons) 50-56\n# passengers (100s) 58-64\n# Length (100s of feet) 66-72\n# Cabins (100s) 74-80\n# Passenger Density 82-88\n# Crew (100s) 90-96\n#\n# It is saved in a csv file for you called \"cruise_ship_info.csv\". Your job is to create a regression model that will help predict how many crew members will be needed for future ships. The client also mentioned that they have found that particular cruise lines will differ in acceptable crew counts, so it is most likely an important feature to include in your analysis!\n#\n# Once you've created the model and tested it for a quick check on how well you can expect it to perform, make sure you take a look at why it performs so well!\n\n#%%\n\nfrom pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.sql.functions import corr\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.sql import SparkSession\nimport findspark\nfindspark.init('/home/dangkhoa/spark-2.3.1-bin-hadoop2.7')\n\n\n# ## Session\n\n#%%\n\nspark = SparkSession.builder.appName('Linear_Regression').getOrCreate()\n\n\n# ## Load dataset\n\n#%%\n\nraw_data = spark.read.csv(\"cruise_ship_info.csv\",\n inferSchema=True, header=True)\n\nraw_data.printSchema()\n\n#%%\n\nraw_data.show(5)\n\n\n# ## String Indexer\n\n#%%\n\n# String Indexer\n\nindexer = StringIndexer(\n inputCol=\"Cruise_line\",\n outputCol=\"Cruise_line_Index\")\n\nstring_indexed_data = indexer.fit(raw_data).transform(raw_data)\nstring_indexed_data.show(5)\n\n\n# ## Data Exploratory\n\n#%%\n\nstring_indexed_data.groupBy('Cruise_line').count().show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'Cruise_line_Index'),\n corr('crew', 'Age'),\n corr('crew', 'Tonnage'),\n corr('crew', 'passengers')).show()\n\n#%%\n\nstring_indexed_data.select(\n corr('crew', 'length'),\n corr('crew', 'passenger_density'),\n corr('crew', 'cabins')).show()\n\n\n# ## Vector Assembler\n# - Grab all columns -> 1 single vector\n\n#%%\n\n# Define assembler\nassembler = VectorAssembler(\n inputCols=['Tonnage',\n 'passengers',\n 'length',\n 'cabins'],\n outputCol='features')\n\n# transform\nvector_indexed_data = assembler.transform(string_indexed_data)\nvector_indexed_data .select([\n 'Tonnage', 'passengers', 'length', 'cabins',\n 'features',\n 'crew']) \\\n .show(5)\n\n\n# ## Create dataset - Train/Test set\n\n#%%\n\n# X = features, y = crew\ndataset = vector_indexed_data.select('features', 'crew')\n\ndataset.show(5)\n\n#%%\n\ntrain_data, test_data = dataset.randomSplit([0.7, 0.3])\n\n\n# ## Linear Regression\n\n#%%\n\nlr = LinearRegression(\n featuresCol='features',\n labelCol='crew',\n predictionCol='prediction')\n\n#%%\n\n# Train\nmodel = lr.fit(train_data)\n\n\n# ## Model Summary\n\n#%%\n\nprint(\"Coefficients: {}\\nIntercept: {}\".format(\n model.coefficients, 
model.intercept))\n\n#%%\n\ntrainingSummary = model.summary", "original_comment": "# train RMSE, MSE\n", "target_code": "print(\"RMSE: {}\".format(trainingSummary.rootMeanSquaredError))\nprint(\"MSE: {}\".format(trainingSummary.meanSquaredError))\nprint(\"R2: {}\".format(trainingSummary.r2))\n", "project_metadata": {"full_name": "dangkhoadl/my-BigData", "description": "A cache to store my Distributed System and Big Data resources", "topics": ["big-data", "coursera", "operating-systems", "distributed-systems", "cloud-computing"], "git_url": "git://github.com/dangkhoadl/my-BigData.git", "stars": 7, "watchers": 7, "forks": 8, "created": "2017-12-23T05:56:43Z", "size": 49086, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 888066, "C++": 48288, "Shell": 6317, "Python": 3334, "Makefile": 990}, "last_updated": "2020-01-21T03:30:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly agree", "precision-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "predictions = model.transform(test_data)\nevaluator = RegressionEvaluator(\n labelCol=\"crew\",\n predictionCol=\"prediction\",\n metricName=\"rmse\")\nrmse = evaluator.evaluate(predictions)\nprint(\"Root-mean-square error = \" + str(rmse))\n", "model": "natural", "intent": "# train RMSE, MSE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. 
If you are sure of it go-forward & run the cells below.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n#%%\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. If you are sure of it go-forward & run the cells below.\n\n#%%", "original_comment": "# Download the raw-data files\n", "target_code": "get_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1gkgUlkaRXUzrNR_jY42ieK4xtLX3ztKX'\")\nget_ipython().system(\n \"unzip 'raw_data_files.zip' -d '/content/drive/My Drive/ICDMAI_Tutorial/'\")\n", "project_metadata": {"full_name": "amitbcp/icdmai_2020", "description": "This repository is for the Session held in International Conference on Data Management, Analytics and Innovation, New Delhi 2020", "topics": ["deeplearning", "recurrent-neural-networks", "rnn-pytorch", "word-embeddings", "text-classification", "rnns", "notebooks", "stackoverflow", "tag-recommender", "recommendation-system", "svm", "onevsrest"], "git_url": "git://github.com/amitbcp/icdmai_2020.git", "stars": 7, "watchers": 7, "forks": 2, "created": "2020-01-04T04:42:01Z", "size": 13078, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2676004}, "last_updated": "2021-01-06T14:44:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "warnings.filterwarnings('ignore')\n", "model": "no-comments", "intent": "# Download the raw-data files"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as 
pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. 
I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. (fips, regionidcity, regionidzip, regionidcounty)\n\n\ndf['age'] = 2017 - df.yearbuilt\n\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. I'll test it here, but will need to implement above before we do all the prepping.\n\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. 
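The two reductions listed above (collapsing `regionidcity` to a handful of frequent cities plus a catch-all id, and truncating `regionidzip` to its first 3 digits) are described but never carried out in this notebook, which ultimately defers them in favour of statistical tests. If you did want to apply them, a minimal sketch might look like the following; the `city_reduced` and `zip3` column names are hypothetical, and it operates on the `df` built above.

```python
# keep the 5 most frequent city ids, send everything else to a catch-all id of -1
top_cities = df.regionidcity.value_counts().nlargest(5).index
df['city_reduced'] = df.regionidcity.where(df.regionidcity.isin(top_cities), other=-1)

# reduce zip variance by keeping only the leading 3 digits of the zip id
# (same float -> int -> str chain the notebook uses later for these id columns)
df['zip3'] = df.regionidzip.astype(float).astype(int).astype(str).str[:3]
```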
I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. 
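As a quick standalone illustration of the trade-off described above (a toy example, not part of the notebook's pipeline): a quantile/uniform transform reduces an extreme outlier to nothing more than the largest rank, while a min-max scale lets that one outlier squash every other value toward zero.

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

# toy lot-size column with one extreme outlier
toy = pd.DataFrame({'lot_sqft': [5_000, 6_000, 7_000, 8_000, 1_000_000]})

uniform = QuantileTransformer(output_distribution='uniform',
                              n_quantiles=5, random_state=123).fit_transform(toy)
minmax = MinMaxScaler(feature_range=(0, 1)).fit_transform(toy)

print(uniform.ravel())  # ranks spread evenly: [0.  0.25 0.5  0.75 1. ]
print(minmax.ravel())   # outlier dominates: first four values land near 0, outlier at 1
```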
For this reason, we will use a min-max scaler.\n\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. k=8.\n\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. 
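One way to sanity-check that visual trend numerically, using the already-scaled training columns from above, is a quick correlation matrix; a negative correlation between latitude and the two dollar-per-sqft features would match the pattern described in the plots. This is just a diagnostic aside, not part of the modeling pipeline.

```python
# correlation between location and dollar-per-sqft features on the training set
print(train[['latitude', 'longitude',
             'land_dollar_per_sqft', 'structure_dollar_per_sqft']].corr().round(2))
```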
(low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n\ntrain.cluster_loc.value_counts()\n\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 7 clusters\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n\ntest.cluster_home.value_counts()\n\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n\nless_significant_clusters\n\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data Validation**\n\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if 
greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n\ntrain.head()\n\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n\ntrain_no_clusters.head()\n\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 models, 1 for each county\n# Try with clusters and then try with original 
features\n\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = 
lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## acquire\n#\n# Zillow data:\n# - 2017 data\n# - Latest transaction per property id only.\n# - The logerror from that latest transaction.\n# - All fields related to the properties.\n# - Gather descriptions from the lookup tables.\n# - Only properties with latitude and longitude.\n# - Only single family homes.\n\n#%%\n\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.linear_model import SGDRegressor, LassoCV\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.svm import LinearSVR\nimport scipy as sp\nimport prepare\nimport summarize\nimport acquire\nimport warnings\nfrom mpl_toolkits.mplot3d import Axes3D\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import PowerTransformer, LabelEncoder, OneHotEncoder, QuantileTransformer, MinMaxScaler\nfrom sklearn.cluster import KMeans\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings(\"ignore\")\n\n#%%\n\ndf = acquire.get_zillow_data()\n\n\n# Only single family\n\n#%%\n\ndf = df[df.propertylandusedesc == 'Single Family Residential']\n\n\n# ## prepare\n#\n# ### Missing Values\n\n# - remove columns with > 99% missing and rows > 40% missing\n# - aggregate pool information: use all pool and spa columns to compute a single boolean attribute of `has_pool`\n# - fill with 0: taxdelinquencyflag, fireplacecnt, garagecarcnt and convert them to boolean\n# - After doing all that, then remove all columns with > 5% missing, and following that, rows with > 99% missing\n\n#%%\n\n# remove columns with > 99% missing and rows > 40% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.01, prop_required_row=.40)\n\n# aggregate pool information: use all pool and spa columns to compute a 
single attribute of pool_spa\n# gather pool columns\npool_cols = ['hashottuborspa', 'poolcnt',\n 'poolsizesum', 'pooltypeid2', 'pooltypeid7']\n# fill all missing values with 0\npool = df[pool_cols].fillna(0)\n# where there is a value in one or more of the pool attributes, assign a 1 to a new col named 'pool'\npool.loc[pool.sum(axis=1) > 0, 'has_pool'] = 1\n# append the new column to our original dataframe and remove the original pool columns\ndf = df.join(pool[['has_pool']])\n\n# fill with 0\ndf.loc[df.taxdelinquencyflag == 'Y', 'is_taxdelinquent'] = 1\ndf.loc[df.fireplacecnt > 0, 'has_fireplace'] = 1\ndf.loc[df.garagecarcnt > 0, 'has_garage'] = 1\nfill_with_0 = ['has_garage', 'has_fireplace', 'has_pool', 'is_taxdelinquent']\ndf[fill_with_0] = df[fill_with_0].fillna(0)\n\n# remove columns where > 5% missing and rows where > 99% missing\ndf = prepare.handle_missing_values(\n df, prop_required_column=.95, prop_required_row=.99)\n\n\n# ### Variable Changes\n#\n# Are there any instances where taxvaluedollarcnt is not equal to the sum of the land tax value and the structure tax value? (landtaxvaluedollarcnt + structuretaxvaluedollarcnt).\n\n#%%\n\nnp.where((df.taxvaluedollarcnt - (df.landtaxvaluedollarcnt +\n df.structuretaxvaluedollarcnt)) != 0)\n# add taxvaluedollarcnt to list to drop\n\n\n# No. I will attempt to reduce the dependency between variables and extracting the most unique information from each.\n#\n# - `land_dollar_per_sqft`: a land dollar per sqft (landtaxvaluedollarcnt/lotsizesquarefeet)\n# - `structure_dollar_per_sqft`: structuretaxvaluedollarcnt/calculatedfinishedsquarefeet\n# - `tax_rate`: taxvaluedollarcnt/taxamount\n# - compute `living_area_sqft` by subtracting estimated square feet from bedrooms (121: 11x11) and bathrooms (36: 6x6)\n# - compute `bedbath_index` where multiple bedrooms by a weight of 2, full baths by weight of 1, half/three-quarter baths by weight of .5, then sum them all together.\n#\n\n#%%\n\ndf['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / \\\n df.calculatedfinishedsquarefeet\ndf['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt/df.lotsizesquarefeet\ndf['living_area_sqft'] = df.calculatedfinishedsquarefeet - \\\n (df.bedroomcnt*121 + df.bathroomcnt*36)\ndf['tax_rate'] = df.taxvaluedollarcnt/df.taxamount\ndf['bedbath_index'] = df.bedroomcnt*2 + \\\n df.fullbathcnt + .5*(df.bathroomcnt-df.fullbathcnt)\n\n\n# - turn yearbuilt into age (from present)\n# - reduce regionidcity into the top 5 cities and the others assign to a catch-all id.\n# - take the first 3 digits of zip to reduce the variance in zipcode\n# - Look at variables that don't actually represent numeric values to think about encoding. (fips, regionidcity, regionidzip, regionidcounty)\n\n#%%\n\ndf['age'] = 2017 - df.yearbuilt\n\n#%%\n\ndf.loc[:, 'latitude'] = df.loc[:, 'latitude']/1e6\ndf.loc[:, 'longitude'] = df.loc[:, 'longitude']/1e6\n\n\n# City ID and County: Is there any cross-over or is city purely a subset of county?\n#\n# Count the number of counties each city is located in:\n\n#%%\n\nct = pd.DataFrame(pd.crosstab(df.regionidcity, df.regionidcounty))\ns = ct.astype(bool).sum(axis=1)\ns = s.where(s > 1).dropna()\npd.crosstab(df[df.regionidcity.isin(list(s.index))].regionidcity,\n df[df.regionidcity.isin(list(s.index))].regionidcounty)\n\n\n# Taking a look at these, I can see that when there are multiple counties, there is clearly a dominant county and only a handful of properties in the other. I will 'fix' the anomalies to be in what is likely the correct county. 
I'll test it here, but will need to implement above before we do all the prepping.\n\n#%%\n\ndf.loc[df.regionidcity.isin([5465.0, 12447.0, 12520.0]),\n 'regionidcounty'] = 3101.0\ndf.loc[df.regionidcity.isin(\n [10608.0, 15237.0, 18874.0, 44833.0]), 'regionidcounty'] = 1286.0\ndf.loc[df.regionidcity == 41673.0, 'regionidcounty'] = 2061.0\ndf.regionidcounty.value_counts()\n\n\n# - Looking at the counts for each county, it seems reasonable to not split county 2061 geographically much more than that.\n# - County 1286 can probably be split more effectively: city 16764 and all others\n# - County 3101 can definitedly be split more effectively: city 12447, 5534, 46298, 40227, and all others\n#\n# However, I'm going to wait to do this. I will first run some statistical tests to see if there are cities and zips that have significantly different logerror from the rest of the properties.\n\n#%%\n\n# df.loc[(df['regionidcity']==12447) | (df['regionidcity']==5534) | (df['regionidcity']==40227) | (df['regionidcity']==46298) | (df['regionidcity']==16764), 'cityid'] = df['regionidcity']\n# df.cityid.fillna(0, inplace=True)\n\n#%%\n\n# have to do float first because of an issue with 0.0, then int, then string to ensure no decimals in the string.\n# df['cityid'] = df.cityid.astype(float).astype(int).astype(str)\ndf['regionidcity'] = df.regionidcity.astype(float).astype(int).astype(str)\ndf['regionidzip'] = df.regionidzip.astype(float).astype(int).astype(str)\ndf['regionidcounty'] = df.regionidcounty.astype(float).astype(int).astype(str)\n\n#%%\n\n# df['loc_id'] = df.regionidcounty + '_' + df.cityid\n\n#%%\n\n# clean up remaining columns\ndf_prepped = df.drop(columns=(['id', 'parcelid', 'assessmentyear', 'propertycountylandusecode',\n 'propertylandusedesc', 'transactiondate', 'propertylandusetypeid',\n 'finishedsquarefeet12', 'taxvaluedollarcnt', 'fips',\n 'yearbuilt', 'rawcensustractandblock', 'censustractandblock', 'roomcnt',\n 'calculatedbathnbr', 'taxamount', 'calculatedfinishedsquarefeet',\n 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt',\n 'bedroomcnt', 'bathroomcnt', 'fullbathcnt'\n ]))\n\n#%%\n\ndf_prepped.shape\n\n\n# ### Split Data\n\n#%%\n\ntrain, test = train_test_split(df_prepped, test_size=.30)\n\n\n# ### Scaling\n\n# #### Create Uniform Scaler\n# when we the space between doesn't matter as much as order does, a uniform scaler is a good choice.\n\n#%%\n\ndef scale_uniform(train, test, column_list):\n scaler = QuantileTransformer(\n output_distribution='uniform', random_state=123)\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = test.join(test_scaled)\n\n return train, test, scaler\n\n\n# #### Create MinMax Scaler\n# When we want to preserve distance but want to be within bounds, a min-max scaler is a good choice.\n\n#%%\n\ndef scale_minmax(train, test, column_list):\n scaler = MinMaxScaler(feature_range=(0, 1))\n train_scaled = pd.DataFrame(scaler.fit_transform(train[column_list]),\n columns=column_list,\n index=train.index)\n train.drop(columns=column_list, inplace=True)\n train = train.join(train_scaled)\n\n test_scaled = pd.DataFrame(scaler.transform(test[column_list]),\n columns=column_list,\n index=test.index)\n test.drop(columns=column_list, inplace=True)\n test = 
test.join(test_scaled)\n\n return train, test, scaler\n\n\n# We will scale square feet, dollar per square foot, tax rate, beds & baths using a uniform scaler as that will help minimize the impact extreme outliers will have.\n#\n# For latitude, longitude and age, we want to preserve the distance between. We want 1876 to be futher away from the next oldest house of 1900 than 1900 is from the next oldest house of 1901. For this reason, we will use a min-max scaler.\n\n#%%\n\ncolumn_list1 = ['lotsizesquarefeet', 'structure_dollar_per_sqft',\n 'land_dollar_per_sqft', 'living_area_sqft', 'tax_rate', 'bedbath_index']\ntrain, test, scaler_uniform = scale_uniform(train, test, column_list1)\n\ncolumn_list2 = ['latitude', 'longitude', 'age']\ntrain, test, scaler_minmax = scale_minmax(train, test, column_list2)\n\n#%%\n\n# train_prepped.describe().T\ntrain.info()\n\n\n# ## Cluster\n#\n# ### K-Means\n#\n# #### Elbow Method to determine best 'K'\n\n# 1. Let's first cluster by geolocation of latitude and longitude along with dollar per square foot (land and structure) and tax rate.\n#\n# 2. Then we will cluster by lot size, living area, beds and baths, and age.\n#\n# ##### Clustering 1: location, dollar/sqft, tax rate\n\n#%%\n\ncluster1_cols = ['latitude', 'longitude',\n 'land_dollar_per_sqft', 'structure_dollar_per_sqft']\n\n\n# Compute and plot the sum squared distances of each sample to closest cluster center at each k-value.\n\n#%%\n\ndef select_k(cluster_df, ks):\n sse = []\n for k in ks:\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(cluster_df)\n\n # inertia: Sum of squared distances of samples to their closest cluster center.\n sse.append(kmeans.inertia_)\n\n # print(pd.DataFrame(dict(k=ks, sse=sse)))\n\n p = plt.plot(ks, sse, 'bx-')\n p = plt.xlabel('k')\n p = plt.ylabel('SSE')\n p = plt.title('The Elbow Method to find the optimal k')\n\n compare_df = pd.DataFrame(dict(k=ks, sse=sse)).assign(\n change_in_sse=lambda df: df.sse.diff())\n return compare_df, p\n\n#%%\n\nselect_k(cluster_df=train[cluster1_cols], ks=range(1, 13))\n\n\n# I would say 6 or 8 is hwere the bottom of the elbow sits.\n# Let's compare k=6 vs. 
k=8.\n\n#%%\n\ndef compare_clusters(cluster_df, x_column, y_column, z_column, k1, k2):\n estimators = [(str(k1)+' Clusters', KMeans(n_clusters=k1, n_init=1, max_iter=100, random_state=123)),\n (str(k2)+' Clusters', KMeans(n_clusters=k2, n_init=1, max_iter=100, random_state=123))]\n\n fig, axs = plt.subplots(1, 2, figsize=(\n 14, 6), subplot_kw={'projection': '3d'})\n\n for ax, (title, kmeans) in zip(axs, estimators):\n # fit the kmeans object\n kmeans.fit(cluster_df)\n\n labels = kmeans.labels_\n\n ax.scatter(cluster_df[x_column],\n cluster_df[y_column],\n cluster_df[z_column],\n c=labels.astype(np.float), edgecolor='k')\n ax.set(xticklabels=[], yticklabels=[], zticklabels=[])\n ax.set(xlabel=x_column, ylabel=y_column, zlabel=z_column)\n ax.set(title=title)\n\n plt.show()\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='land_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='longitude',\n k1=6, k2=8)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster1_cols],\n x_column='latitude', y_column='structure_dollar_per_sqft', z_column='land_dollar_per_sqft',\n k1=6, k2=8)\n\n\n# As latitude moves east (left on the chart), we can see the land dollar per sqft and structure dollar per sqft increases, indicated by the slope upward as you move back and to the left. (low latitude, high structure dollar per sqft, high land dollar per sqft.\n\n# I'm going to go with 8.\n\n#%%\n\ndef create_k_clusters(train, test, cluster_feature_id, cluster_cols, k):\n kmeans = KMeans(n_clusters=k, n_init=1, max_iter=100, random_state=123)\n kmeans.fit(train[cluster_cols])\n cluster_feature = 'cluster'+str(cluster_feature_id)+'_id'\n train[cluster_feature] = kmeans.predict(train[cluster_cols])\n test[cluster_feature] = kmeans.predict(test[cluster_cols])\n return train, test, kmeans\n\n#%%\n\ntrain, test, kmeans1 = create_k_clusters(\n train, test, cluster_feature_id=1, cluster_cols=cluster1_cols, k=8)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\ntest.rename(index=str, columns={'cluster1_id': 'cluster_loc'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans1.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\nfor i in range(0, len(cluster1_cols)):\n for j in range(0, len(cluster1_cols)):\n sns.relplot(\n data=train, x=cluster1_cols[i], y=cluster1_cols[j], hue='cluster')\n\n#%%\n\ntrain.cluster_loc.value_counts()\n\n#%%\n\ntest.cluster_loc.value_counts()\n\n\n# ##### Clustering 2: size fields and age\n\n#%%\n\ncluster2_cols = ['lotsizesquarefeet',\n 'living_area_sqft', 'bedbath_index', 'age']\n\n#%%\n\nselect_k(cluster_df=train[cluster2_cols], ks=range(1, 13))\n\n\n# Compare 5 vs. 
7 clusters\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='bedbath_index',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='living_area_sqft', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='lotsizesquarefeet', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n#%%\n\ncompare_clusters(cluster_df=train[cluster2_cols],\n x_column='living_area_sqft', y_column='bedbath_index', z_column='age',\n k1=5, k2=7)\n\n\n# I'm going to go with 7 clusters.\n\n#%%\n\ntrain, test, kmeans2 = create_k_clusters(\n train, test, cluster_feature_id=2, cluster_cols=cluster2_cols, k=7)\n\n#%%\n\ntrain.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\ntest.rename(index=str, columns={'cluster2_id': 'cluster_home'}, inplace=True)\n\n#%%\n\ntrain['cluster'] = kmeans2.labels_\ntrain.cluster = 'cluster_' + (train.cluster + 1).astype('str')\n\nfor i in range(0, len(cluster2_cols)):\n for j in range(0, len(cluster2_cols)):\n sns.relplot(\n data=train, x=cluster2_cols[i], y=cluster2_cols[j], hue='cluster')\n\n\n# ### Feature Selection\n\n# #### Which variables and clusters give information about logerror?\n#\n# First, I need to confirm that 'logerror' is normally distributed, to determine if I can run t-tests to test the differences in means across different clusters.\n\n#%%\n\nplt.hist(train.logerror, bins=1000)\nplt.show()\n\n\n# Looks good!\n\n# **Test:** Home driven clusters\n#\n# First, let's look at the mean log error by cluster id. We will do this for both the train and test as more of a data quality check...to confirm that our cluster id's are showing similar results in both samples.\n\n#%%\n\nprint(pd.DataFrame(train.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\nprint(pd.DataFrame(test.groupby(['cluster_home'])[\n 'logerror'].mean().reset_index()))\n\n\n# Test the different in the mean logerror for each cluster vs all others. 
When the p-value is < .05 then we keep the cluster id, else we replace the cluster id with -1.\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_home)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_home == i].logerror.dropna(),\n train[train.cluster_home != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\ntrain.cluster_home = train.cluster_home.replace(less_significant_clusters, -1)\ntest.cluster_home = test.cluster_home.replace(less_significant_clusters, -1)\n\n#%%\n\ntest.cluster_home.value_counts()\n\n#%%\n\ntrain.cluster_home.value_counts()\n\n\n# **Test:** Location driven clusters\n\n#%%\n\npd.DataFrame(train.groupby(['cluster_loc'])['logerror'].mean().reset_index())\n\n#%%\n\nless_significant_clusters = []\n\nfor i in range(0, max(train.cluster_loc)+1):\n stat, pval = sp.stats.ttest_ind(\n train[train.cluster_loc == i].logerror.dropna(),\n train[train.cluster_loc != i].logerror.dropna())\n if pval > .05:\n less_significant_clusters = less_significant_clusters + [i]\n\n#%%\n\nless_significant_clusters\n\n#%%\n\ntrain.cluster_loc = train.cluster_loc.replace(less_significant_clusters, -1)\ntest.cluster_loc = test.cluster_loc.replace(less_significant_clusters, -1)\n\n\n# **Test:** is_taxdelinquent\n\n#%%\n\npd.DataFrame(train.groupby(['is_taxdelinquent'])\n ['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.is_taxdelinquent == 0].logerror.dropna(),\n train[train.is_taxdelinquent == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['is_taxdelinquent'], inplace=True)\n test.drop(columns=['is_taxdelinquent'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# if pval < 0.05 then the column should still exist:\n'is_taxdelinquent' in train.columns\n\n\n# **Results:** is_taxdelinquent was correctly left as feature.\n#\n# _________________________\n#\n# **Test:** has_pool\n\n#%%\n\npd.DataFrame(train.groupby(['has_pool'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_pool == 0].logerror.dropna(),\n train[train.has_pool == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_pool'], inplace=True)\n test.drop(columns=['has_pool'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_pool' in train.columns\n\n\n# **Results:** has_pool was correctly left as feature\n#\n# ____________________________\n#\n# **Test:** has_fireplace\n\n#%%\n\npd.DataFrame(train.groupby(['has_fireplace'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_fireplace == 0].logerror.dropna(),\n train[train.has_fireplace == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_fireplace'], inplace=True)\n test.drop(columns=['has_fireplace'], inplace=True)\n\n\n# **Data Validation**\n\n#%%\n\nprint(pval)\n# verify column exists if pval < 0.05, and not if greater\n'has_fireplace' in train.columns\n\n\n# **Results:** has_fireplace was correctly removed\n# _______________________\n#\n# **Test:** has_garage\n\n#%%\n\npd.DataFrame(train.groupby(['has_garage'])['logerror'].mean().reset_index())\n\n#%%\n\nstats, pval = sp.stats.ttest_ind(\n train[train.has_garage == 0].logerror.dropna(),\n train[train.has_garage == 1].logerror.dropna())\n\nif pval > .05:\n train.drop(columns=['has_garage'], inplace=True)\n test.drop(columns=['has_garage'], inplace=True)\n\n\n# **Data 
Validation**\n\n#%%\n\nprint(pval)\n\n# verify column exists if pval < 0.05, and not if greater\n'has_garage' in train.columns\n\n\n# **Results:** has_garage was correctly left as a feature\n#\n# _______________________\n#\n# **Clean up remaining features**\n\n#%%\n\ntrain.head()\n\n#%%\n\ncols_to_remove = ['tax_rate', 'regionidcity', 'regionidzip']\nnon_cluster_features = ['lotsizesquarefeet', 'structure_dollar_per_sqft', 'land_dollar_per_sqft',\n 'living_area_sqft', 'bedbath_index', 'latitude', 'longitude', 'age']\ntrain_no_clusters = train[non_cluster_features+['regionidcounty', 'logerror']]\ntest_no_clusters = test[non_cluster_features+['regionidcounty', 'logerror']]\n\n#%%\n\ncols_to_remove = cols_to_remove + non_cluster_features\ntrain_clusters = train.drop(columns=cols_to_remove)\ntest_clusters = test.drop(columns=cols_to_remove)\n\n#%%\n\ntrain_no_clusters.head()\n\n#%%\n\ntrain_clusters.head()\n\n\n# ### Encode\n#\n# Which columns are of numeric format but represent classes or categories?\n# fips\n# rawcensustractandblock\n# regionidcity\n# regionidcounty\n# regionidzip\n#\n\n#%%\n\ndef encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # Integer Encoding\n int_encoder = LabelEncoder()\n train.encoded = int_encoder.fit_transform(train[col_name])\n test.encoded = int_encoder.transform(test[col_name])\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train.encoded).reshape(len(train.encoded), 1)\n test_array = np.array(test.encoded).reshape(len(test.encoded), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, int_encoder, ohe\n\n\ndef one_hot_encode(train, test, col_name):\n\n encoded_values = sorted(list(train[col_name].unique()))\n columns = [col_name + '_' + str(val) for val in encoded_values]\n\n # create 2D np arrays of the encoded variable (in train and test)\n train_array = np.array(train[col_name]).reshape(len(train[col_name]), 1)\n test_array = np.array(test[col_name]).reshape(len(test[col_name]), 1)\n\n # One Hot Encoding\n ohe = OneHotEncoder(sparse=False, categories='auto')\n train_ohe = ohe.fit_transform(train_array)\n test_ohe = ohe.transform(test_array)\n\n # Turn the array of new values into a data frame with columns names being the values\n # and index matching that of train/test\n # then merge the new dataframe with the existing train/test dataframe\n train_encoded = pd.DataFrame(data=train_ohe,\n columns=columns, index=train.index)\n train = train.join(train_encoded)\n\n test_encoded = pd.DataFrame(data=test_ohe,\n columns=columns, index=test.index)\n test = test.join(test_encoded)\n\n return train, test, ohe\n\n#%%\n\ntrain_clusters, test_clusters, ohe_loc = one_hot_encode(\n train_clusters, test_clusters, 'cluster_loc')\ntrain_clusters, test_clusters, ohe_home = one_hot_encode(\n train_clusters, test_clusters, 'cluster_home')\n\n\n# Build 3 
models, 1 for each county\n# Try with clusters and then try with original features\n\n#%%\n\ntrain_clusters.regionidcounty.value_counts()\n\n\n# Separate the clusters dataframes by county\n\n#%%\n\ntrain_3101_c = train_clusters[train_clusters.regionidcounty == '3101']\ntest_3101_c = test_clusters[test_clusters.regionidcounty == '3101']\n\ntrain_1286_c = train_clusters[train_clusters.regionidcounty == '1286']\ntest_1286_c = test_clusters[test_clusters.regionidcounty == '1286']\n\ntrain_2061_c = train_clusters[train_clusters.regionidcounty == '2061']\ntest_2061_c = test_clusters[test_clusters.regionidcounty == '2061']\n\n\n# Separate the non-clusters dataframes by county\n\n#%%\n\ntrain_3101_nc = train_no_clusters[train_no_clusters.regionidcounty == '3101']\ntest_3101_nc = test_no_clusters[test_no_clusters.regionidcounty == '3101']\n\ntrain_1286_nc = train_no_clusters[train_no_clusters.regionidcounty == '1286']\ntest_1286_nc = test_no_clusters[test_no_clusters.regionidcounty == '1286']\n\ntrain_2061_nc = train_no_clusters[train_no_clusters.regionidcounty == '2061']\ntest_2061_nc = test_no_clusters[test_no_clusters.regionidcounty == '2061']\n\n\n# Now that we have encoded and split by county, we can remove some other columns to have our final X with features.\n# We only need a y_train and y_test for each county, not separated by the features, obviously.\n#\n# X dataframes for the cluster features:\n\n#%%\n\ncols_to_drop = ['cluster_loc', 'cluster_home', 'cluster_loc_-1',\n 'cluster_home_-1', 'logerror', 'regionidcounty']\n\nX_train_3101_c = train_3101_c.drop(columns=cols_to_drop)\nX_test_3101_c = test_3101_c.drop(columns=cols_to_drop)\n\nX_train_1286_c = train_1286_c.drop(columns=cols_to_drop)\nX_test_1286_c = test_1286_c.drop(columns=cols_to_drop)\n\nX_train_2061_c = train_2061_c.drop(columns=cols_to_drop)\nX_test_2061_c = test_2061_c.drop(columns=cols_to_drop)\n\n\n# X dataframes for the non-cluster features\n\n#%%\n\ncols_to_drop = ['logerror', 'regionidcounty']\n\nX_train_3101_nc = train_3101_nc.drop(columns=cols_to_drop)\nX_test_3101_nc = test_3101_nc.drop(columns=cols_to_drop)\n\nX_train_1286_nc = train_1286_nc.drop(columns=cols_to_drop)\nX_test_1286_nc = test_1286_nc.drop(columns=cols_to_drop)\n\nX_train_2061_nc = train_2061_nc.drop(columns=cols_to_drop)\nX_test_2061_nc = test_2061_nc.drop(columns=cols_to_drop)\n\n\n# y dataframes\n\n#%%\n\ny_train_3101 = train_3101_c[['logerror']]\ny_test_3101 = test_3101_c[['logerror']]\n\ny_train_1286 = train_1286_c[['logerror']]\ny_test_1286 = test_1286_c[['logerror']]\n\ny_train_2061 = train_2061_c[['logerror']]\ny_test_2061 = test_2061_c[['logerror']]\n\n\n# ## Model\n\n#%%\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_c)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = 
LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_c, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_c)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# #### non-clustering features\n#\n# ##### Linear Support Vector Regressor from sklearn.svm\n\n#%%\n\nregr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = regr.predict(X_train_3101_nc)\nprint(mean_squared_error(y_train_3101, y_pred_3101)**1/2)\n\n\n# ##### Support Gradient Descent Regressor from sklearn.linear_model\n\n#%%\n\nsgd = SGDRegressor(fit_intercept=False, max_iter=1000, random_state=123)\nsgd.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = sgd.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Lasso with Cross Validation from sklearn.linear_model\n\n#%%\n\nlasso = LassoCV(fit_intercept=False)\nlasso.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = lasso.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# ##### Decision Tree Regressor from sklearn.tree\n\n#%%\n\ndt = DecisionTreeRegressor(random_state=123)\ndt.fit(X_train_3101_nc, y_train_3101)\ny_pred_3101 = dt.predict(X_train_3101_nc)\nmean_squared_error(y_train_3101, y_pred_3101)**1/2\n\n\n# WOW!!!\n\n# ### county: 3101\n#\n# #### Baseline\n\n#%%\n\nse = y_train_3101.logerror * y_train_3101.logerror\nmse = se.mean()\nrmse = mse**1/2\nrmse\n\n\n# #### Clustering Features\n#", "original_comment": "# ##### Linear Support Vector Regressor from sklearn.svm\n", "target_code": "regr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_3101_c, y_train_3101)\n", "project_metadata": {"full_name": "CodeupClassroom/bayes-methodologies-exercises", "description": "Bayes exercises on methodologies", "topics": [], "git_url": "git://github.com/CodeupClassroom/bayes-methodologies-exercises.git", "stars": 5, "watchers": 5, "forks": 3, "created": "2019-10-09T14:04:48Z", "size": 13779, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 17490873, "Python": 71621}, "last_updated": "2020-01-06T20:54:05Z"}, "annotations": [{"completed_by": {"id": 1}, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "compatibility": "Disagree", "compatibility-score": 1, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2}], "predicted_code": "regr = LinearSVR(random_state=123, tol=1e-5,\n loss='squared_epsilon_insensitive', fit_intercept=False, dual=False)\nregr.fit(X_train_nc, y_train_nc)\ny_pred_nc = regr.predict(X_train_nc)\nmean_squared_error(y_train_nc, y_pred_nc)**1/2\n", "model": "natural", "intent": "# Linear Support Vector Regressor from sklearn.svm"}, {"context": "#!/usr/bin/env 
python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. 
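The "suite of automated tests" that the CI server runs is nothing more exotic than ordinary test functions checked into the repository. For instance, a file like the hypothetical `test_example.py` below is what a runner such as `nosetests` (referenced in the `.travis.yml` shown further down) discovers and executes on every push; the function and file names here are illustrative only.

```python
# test_example.py -- hypothetical module; nose (and pytest) collect names matching test_*

def add(a, b):
    """Tiny function under test."""
    return a + b

def test_add_integers():
    assert add(2, 2) == 4

def test_add_is_commutative():
    assert add(3, 5) == add(5, 3)
```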
Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. 
the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n\nimport math\nx = 1 + 1\nx\n\n\ny = 1.0 * 2\ny\n\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n\n# Classic division\n3 / 5\n\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n\n5 % 2 # Remainder of number 1 / number 2\n\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n\nx = 1\ny = 1.0\nz = 2\n\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # iPython Notebooks, Continious Integrations Environements and Operators\n\n# ## iPython (Jupyter Notebook)\n#\n# As a quick aside, these notebooks can all be run locally. This is the first week that larger sections of code are being shipped in the notebook and you may want to play around, change some numbers, and experiment. To do this:\n#\n# 1. Clone this repository locally to your computer using the standard `git clone REPO_URL` syntax.\n# 1. Open up a terminal or powershell and `cd` into the cloned directory. For example, if I cloned the repository to /Users/jay/Desktop/week4, I would open the terminal and execute `cd /Users/jay/Desktop/week4` or `cd ~/Desktop/week4`.\n# 1. Once you are in directory, type `jupyter notebook` and the jupyter interface should load. This should automatically open a web browser with the following page:\n#\n# \n#\n# 1. Click the Week4.ipynb link and this notebook will open.\n# 1. To execute a cell, hold `shift` and press `return`.\n\n# ## CI\n# Continious Integration is simply a development practice, where a team of developers are integrating code into a centralized repository at some interval. With ever checkin, automated testing (unit and/or functional) is run. This development model allows for the detection of merge (the act of integrating 2+ persons' code together) issues or bugs with every code update.\n#\n# **TODO:** To get more familiar with CI, please read [this wonderful Martin Fowler article](http://www.martinfowler.com/articles/continuousIntegration.html).\n\n# ### How have we been using CI?\n#\n# We have been using CI since week 2, just not for the code integration aspects. Instead, we have been using CI to run a suite of automated tests (the Koans) in a TDD environment. 
Take for example the following image:\n#\n# \n#\n# You are the developers (1) working on writing code and commiting that code to a version control system (Git / Github). Once the code is pushed to Github, you are submitting a pull request to have your changes integrated into the main development branch (2). This causes the continious integration environment (3) to provision a virtual machine in the cloud, spool up, clone your code, and run the automated test suite (4). Once this is done, the CI environment alerts me and I can check the test results (6). If we were to take everyone in the class, and divide the development work into teams, the utilization of the code repository and CI environment would not change.\n#\n# Image from: http://decks.eric.pe/pantheon-ci/images/ci-architecture.png\n\n# ### Currently Available CI Environments\n# In previous week we looked at github and a DVCS. This is a key component of a CI environment. The other key component are tests, which we develop first in a TDD environment, and a CI Server. Popular CI Servers include [Travis-CI](https://travis-ci.com), [Jenkins](https://jenkins-ci.org), [Appveyor](https://www.appveyor.com) or [BuildBot](http://buildbot.net). My first preference, for builds of open source software without long test cycles (say a build less than 30 minutes) is Travis. Travis is freely available, integrates well with Github, supports Linux and OS X and does not require much setup. We will look more at Travis below. Frequently, the software we develop needs to run on Windows as well. This is where Appveyor steps in. Appveyor is, in many way the Travis of Windows and simply requires that an additional configuration script be created.\n#\n# What happens when the software is larger, proprietary, or not open source. This is where Jenkins comes in. Jenkins can be installed on your own server, with the necessary proprietary software already installed (ArcMap anyone?), and hooks can be used to pull code from your code repository for testing. You install and maintain Jenkins. This equates to additional development time spent working with CI.\n#\n#\n# #### Travis:\n# \n#\n# Getting started with Travis is [easy](https://docs.travis-ci.com/user/getting-started/):\n#\n# 1. Login with your github credentials and allow Travis to access your repositories.\n# 1. Activate a repository\n# 1. Add a .travis.yml to the top level of your code repository.\n#\n# Here is the `.travis.yml` script that we used in week 1. It simply says that we want to test in a Python 3.5 environment and that the script to be run is `nosetests`.\n#\n# ```yml\n# language: python\n# python:\n# - \"3.5\"\n#\n# #command to run tests\n# script: nosetests\n# ```\n#\n# It is equally easy to specify a different script. For example, here is a .travis.yml that builds the GEOS library. (Yes, this is a build.sh that could build any number of libraries.)\n# ```yml\n# #!/bin/sh\n#\n# ./configure --prefix=$PREFIX\n#\n# make\n# make install\n# ```\n# The point is that Travis is not limited to Python, but is able to build Fortran, C, C++, Objective-C (works for Swift as well), Ruby, Go, etc.\n\n# ## Operators / Operands\n\n# This week, we are focusing on Python operators. In general mathematical operators are going to behave precisely how you would expect them to. Here is a list of the operators, with the operators at the top taking precidence over the operators at the bottom (e.g. 
the order of operators moved from top to bottom).\n#\n# \n#\n# Notice that [PEMDAS](http://www.mathsisfun.com/operation-order-pemdas.html), is right in there, though split by function calls, slicing, and some bitwise operators.\n\n# ### Math: Just what you would expect\n\n#%%\n\nimport math\nx = 1 + 1\nx\n\n#%%\n\ny = 1.0 * 2\ny\n\n#%%\n\nx = (2 + 1)**2 # Exponentiation\nx\n\n\n# How about something a little more complex: $7 + (3 x 4^{2} - 1)$\n\n#%%\n\n7 + (3 * 4 ** 2 - 1)\n\n\n# How about translating the formula for the area of a circle into code?\n#\n# Formula: $A = \\pi r^{2}$\n\n#%%\n\nr = 2.0\npi = 3.14 # Bad approximation\n\na = pi * r ** 2\na\n\n\n# How about being a little bit more precise with pi?\n\n#%%\n\nmath_pi = math.pi\nr2 = 2.0\n\na2 = math_pi * r2 ** 2\na2\n\n#%%\n\ndifference = a2 - a\ndifference # Not too off, it all depends on the application\n\n\n# #### Division\n\n#%%\n\n# Classic division\n3 / 5\n\n#%%\n\n3 / 5.0 # Float not required in Python 3, but is in Python 2.x\n\n#%%\n\n5 % 2 # Remainder of number 1 / number 2\n\n#%%\n\n# What if we want both the divisor and any remainder\ndivmod(5, 2)\n\n\n# ### Comparison and Membership\n\n#%%\n\nx = 1\ny = 1.0\nz = 2\n\n#%%\n\nprint(x == y) # Does x equal y, return a boolean\nprint(x == z) # Likewise, does x equal z\nprint(x != z) # Does x not equal z\n\n#%%\n\n# Less than\nprint(x < z)\n# Greater than or equal to\nprint(x >= y)\n\n\n# Note that `<>` (not equal) no longer works in Python 3 (thankfully).\n\n#%%\n\n# A list of numbers, we will talk about lists in a coming lesson, just trust me for now\nx = [1, 2, 3, 4, 5]\n\n# Check if 1 is in x\nprint(1 in x)", "original_comment": "# Check if 0 is in x\n", "target_code": "print(0 in x)\n", "project_metadata": {"full_name": "jlaura/GIS321", "description": null, "topics": [], "git_url": "git://github.com/jlaura/GIS321.git", "stars": 5, "watchers": 5, "forks": 15, "created": "2016-01-11T03:36:14Z", "size": 3772, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 250997}, "last_updated": "2017-04-06T06:32:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "x = [1, 2, 3, 4, 5]\nprint(1 in x)\n", "model": "no-comments", "intent": "# Check if 0 is in x"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional 
embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Visualizing High-Dimensional Datasets with Tensorboard's Embedding Projector\n#\n# ![](projector_screenshot.png)\n\n# ### What's an embedding?\n# - \"a mapping from discrete objects to vectors of real numbers.\"\n# - tries to capture the information of a system in a (usually) high-dimensional vector space\n# - often the input/output for machine learning models\n#\n# **Example:** a phase-space embedding of particles in a simulation\n# ![](https://upload.wikimedia.org/wikipedia/commons/f/f7/Hamiltonian_flow_classical.gif)\n#\n# **or:** a 300-dimensional embedding of English words\n# ```\n# blue: (0.01359, 0.00075997, 0.24608, ..., -0.2524, 1.0048, 0.06259)\n# blues: (0.01396, 0.11887, -0.48963, ..., 0.033483, -0.10007, 0.1158)\n# orange: (-0.24776, -0.12359, 0.20986, ..., 0.079717, 0.23865, -0.014213)\n# oranges: (-0.35609, 0.21854, 0.080944, ..., -0.35413, 0.38511, -0.070976)\n# ```\n#\n\n# ### Google's Embedding Projector\n# [Embedding projector tutorial](https://www.tensorflow.org/guide/embedding)\n#\n# **Some terminology:**\n# - Tensorflow is Google's machine learning framework\n# - Tensorboard is Tensorflow's visualization suite\n# - The embedding projector is a tool inside of Tensorboard\n#\n# [Original embedding projector paper](https://arxiv.org/pdf/1611.05469v1.pdf)\n# - Authors find three common tasks:\n# ![](embedding_projector_tasks.png)\n#\n#\n# Standalone projector: https://projector.tensorflow.org\n# - [Wikipedia: Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n#\n#\n#\n#\n#\n#\n\n#\n#\n#\n#\n# ### How can we load in our own data?\n\n#%%\n\n# import tensorflow and embedding projector\nfrom sklearn import mixture\nimport gensim.models.word2vec as word2vec\nimport seaborn as sns\nimport tensorflow as tf\nfrom tensorflow.contrib.tensorboard.plugins import projector\n\n# other stuff\nimport numpy as np\nimport pandas as pd\nimport pathlib # pathlib2 if in Python 2\n\n# function to load data into tensorboard format\n\n\ndef 
to_tensorboard(name, vectors, metadata, output_dir='tensorboard'):", "original_comment": " # make sure output directory exists\n", "target_code": " output_dir = pathlib.Path(output_dir)\n output_dir.mkdir(exist_ok=True)\n", "project_metadata": {"full_name": "thehackerwithin/illinois", "description": "THW Chapter at U. Illinois", "topics": [], "git_url": "git://github.com/thehackerwithin/illinois.git", "stars": 13, "watchers": 13, "forks": 31, "created": "2015-02-18T19:38:33Z", "size": 61361, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 868658, "TeX": 34143, "R": 18922, "HTML": 10291, "Julia": 5254, "Python": 4028, "C++": 425, "CMake": 94}, "last_updated": "2020-09-30T18:16:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "if not pathlib.Path(output_dir).exists():\n pathlib.Path(output_dir).mkdir()\n", "model": "natural", "intent": " # make sure output directory exists"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. 
Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n\n# Import the Python machine / deep learning libraries:\n\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. 
Definition of the Training Loss Function and Learning Rate\n\n# We are now good to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimize the prediction error of the true return $r_{t+1}$ and the by the model predicted return $\\hat{r}_{t+1}$ at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable to predicts the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\theta^*$ that optimize $\min_{\theta} \|r_{t+1} - f_\theta(r_{t}, r_{t-1}, ..., r_{t-n})\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\mathcal{L}_{\theta}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\mathcal{L}^{MSE}_{\theta} (r_{t+1}, \hat{r}_{t+1}) = \frac{1}{N} \sum_{i=1}^N \| r^{i}_{t+1} - \hat{r}^{i}_{t+1}\|^{2}$,
\n\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 10 - \"Long Short-Term Memory (LSTM) Neural Networks\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will learn how to apply another type of deep learning technique referred to as **Long Short-Term Memory (LSTM)** neural networks. Unlike standard feedforward neural networks, LSTMs encompass feedback connections that make it a \"general-purpose computer\". LSTMs are designed to process not only a single data point (such as images), but also entire sequences of data, e.g., such as speech, video, or financial time series.\n#\n#\n# We will again use the functionality of the **PyTorch** library to implement and train an LSTM based neural network. The network will be trained on the historic daily (in-sample) returns of an exemplary financial stock. Once the network is trained, we will use the learned model to predict future (out-of-sample) returns. Finally, we will convert the predictions into tradable signals and the backtest the signals accordingly.\n#\n# The figure below illustrates a high-level view on the machine learning process we aim to establish in this lab.\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email via marco.schreyer@fds.ai or damian.borth@fds.ai.\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand the basic concepts, intuitions and major building blocks of **Long Short-Term Memory (LSTM) Neural Networks**.\n# > 2. Know how to **implement and to train an LSTM** to learn a model of financial time-series data.\n# > 3. Understand how to apply such a learned model to **predict future data points of a time-series**.\n# > 4. Know how to **interpret the model's prediction results** and backtest the predictions.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport itertools\nimport os\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import dataloader\nfrom torch.utils import data\nimport torch.optim as optim\nimport torch.nn as nn\nimport torch\nimport bt as bt # library to backtest trading signals\nimport numpy as np\nimport pandas_datareader as dr\nimport pandas as pd\nimport datetime as dt\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2016: \"The Deep Learning Revolution\" Opening in Keynote\"\nYouTubeVideo('Dy0hJWltsyE', width=800, height=400)\n\n\n# ### Setup of the Jupyter Notebook Environment\n\n# Suppress potential warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualization. 
We will mostly use the `PyTorch`, `Numpy`, `Sklearn`, `Matplotlib`, `Seaborn`, `BT` and a few utility libraries throughout the lab:\n\n#%%\n\n# import python data science and utility libraries\n\n\n# Import the backtesting library:\n\n#%%\n\n# Import the Python machine / deep learning libraries:\n\n#%%\n\n# pytorch libraries\n\n\n# Import Python plotting libraries and set general plotting parameters:\n\n#%%\n\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable notebook matplotlib inline plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Create notebook folder structure to store the data as well as the trained neural network models:\n\n#%%\n\nif not os.path.exists('./data'):\n os.makedirs('./data') # create data directory\nif not os.path.exists('./models'):\n os.makedirs('./models') # create trained models directory\n\n\n# Set random seed value to obtain reproducable results:\n\n#%%\n\n# init deterministic seed\nseed_value = 1234\nnp.random.seed(seed_value) # set numpy seed\ntorch.manual_seed(seed_value) # set pytorch seed CPU\n\n\n# Enable GPU computing by setting the `device` flag and init a `CUDA` seed:\n\n#%%\n\n# set cpu or gpu enabled device\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type\n\n# init deterministic GPU seed\ntorch.cuda.manual_seed(seed_value)\n\n# log type of device enabled\nnow = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\nprint('[LOG {}] notebook with \\'{}\\' computation enabled'.format(\n str(now), str(device)))\n\n\n# ### 1. Dataset Download and Data Assessment\n\n# In this section of the lab notebook we will download and access historic daily stock market data ranging from **01/01/2000** to **31/12/2017** of the **\"International Business Machines\" (IBM)** corporation (ticker symbol: \"IBM\"). 
Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API.\n#\n# To start the data download, let's specify the start and end date of the stock market data download:\n\n#%%\n\nstart_date = dt.datetime(2000, 1, 1)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the daily \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nstock_data = dr.data.DataReader(\n 'IBM', data_source='yahoo', start=start_date, end=end_date)\n\n\n# Inspect the top 10 records of the retreived IBM stock market data:\n\n#%%\n\nstock_data.head(10)\n\n\n# Let's also evaluate the data quality of the download by creating a set of summary statistics of the retrieved data:\n\n#%%\n\nstock_data.describe()\n\n\n# Visually inspect the daily closing prices of the \"International Business Machines\" (IBM) stock market data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['Close'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set x-axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\n\n# set y-axis labels and limits\nax.set_ylabel('[stock closing price]', fontsize=10)\nax.set_ylim(20, 220)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# Save the obtained and validated stock market data to the local data directory:\n\n#%%\n\n# save retrieved data to local data directory\nstock_data.to_csv('data/ibm_data_2010_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 2. Data Pre-Processing\n\n# In this section, we will obtain daily returns of the retrieved daily closing prices. Also, we will convert the time-series of daily returns into a set of sequences $s$ of $n$ time steps respectively. The created sequences will then be used to learn a model using an Long Short-Term Memory neural network.\n\n# #### 2.1 Weekend and Holiday Padding\n\n# Let's always forward propagate the last valid available price information observation to the next available valid price information using the Panda's `reindex()` function. 
This in order to also obtain market price information of weekend's and holidays:\n\n#%%\n\n# fill weekends and holidays\nstock_data = stock_data.reindex(index=pd.date_range(\n stock_data.index.min(), stock_data.index.max()), method='ffill')\n\n\n# Inspect the padded stock market data of the \"International Business Machines\" (IBM) stock:\n\n#%%\n\nstock_data.head(10)\n\n\n# Inspect the number of records obtained after the data padding:\n\n#%%\n\nstock_data.shape\n\n\n# #### 2.2 Daily Return Calculation\n\n# Determine the daily returns of the \"International Business Machines\" (IBM) daily closing prices using the Panda's `pct_change()` function:\n\n#%%\n\nstock_data['RETURN'] = stock_data['Close'].pct_change()\n\n\n# Inspect the daily returns of the closing prices:\n\n#%%\n\nstock_data['RETURN']\n\n\n# Visually inspect the obtained daily returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot reconstruction error scatter plot\nax.plot(stock_data.index, stock_data['RETURN'], color='#9b59b6')\n\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Closing Prices', fontsize=10)\n\n\n# #### 2.3 Conduct Train-Test Split for Neural Network Training\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** or **\"in-sample\"** data (the fraction of data records solely used for training purposes) and a **evaluation set** or **\"out-of-sample\"** data (the fraction of data records solely used for evaluation purposes). Pls. 
note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the split fraction of training sequences to **90%** of the total number of obtained sequences:\n\n#%%\n\nsplit_fraction = 0.9\nsplit_row = int(stock_data.shape[0] * split_fraction)\n\n\n# Split obtained returns into training (\"in-sample\") returns $r^{i}_{train}$ and validation (\"out-of-sample\") returns $r^{i}_{valid}$:\n\n#%%\n\ntrain_stock_data = stock_data.iloc[:split_row]\nvalid_stock_data = stock_data.iloc[split_row:]\n\n\n# Visually inspect the obtained train and validation stock returns:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot daily stock returns\nax.plot(stock_data.index[:split_row, ],\n train_stock_data['RETURN'], c='C0', label='train')\nax.plot(stock_data.index[split_row:, ],\n valid_stock_data['RETURN'], c='C1', label='valid')\n\n# rotate x-labels 45 degree angle\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels and limits\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([pd.to_datetime('01-01-2000'), pd.to_datetime('31-12-2017')])\nax.set_ylabel('[daily stock returns]', fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('International Business Machines (IBM) - Daily Historical Stock Returns', fontsize=10)\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{train}$:\n\n#%%\n\ntrain_stock_data.shape\n\n\n# Determine count (shape) of daily return train sequences $r^{i}_{valid}$:\n\n#%%\n\nvalid_stock_data.shape\n\n\n# #### 2.4 Transform Time-Series Into Sequences\n\n# In the following, we determine the number of return time-steps $n$ each individual sequence $s^{i}$ should be comprised of. Each sequence is thereby determined by the number of predictor (return) time-steps $t$ and the prediction (return) horizon $h = t+1$.\n\n# \n\n# In this example, we will set the number of predictor (return) time-steps to $t$=4. This indicates that the input sequence of each sample is a vector of 4 sequential daily stock returns (pls. note, the choice of $t$=4 is arbitrary and should be selected through experimentation). Furthermore, we set the predicted return horizon to 1, which specifies that we aim to forecast a single future time-step.\n\n#%%\n\ntime_steps = 4 # number of predictor timesteps\nhorizon = 1 # number of timesteps to be predicted\nsequence_length = time_steps + horizon # determine sequence length\n\n\n# Next, we extract the sequences $s^i$ of 5 time-steps.\n#\n# Thereby, we will step-wise iterate (\"rolling window\") over the entire sequence of daily stock returns $r_i$. In each iteration step, we extract an individual sequence of stock returns consisting of $n$ time-steps. 
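# As a minimal sketch of this rolling-window step (toy numbers, not the IBM returns), a series of 7 daily returns with `sequence_length = 5` yields 3 overlapping sequences:

#%%

# minimal sketch of the rolling-window extraction on synthetic returns
toy_returns = np.array([0.01, -0.02, 0.03, 0.00, 0.01, -0.01, 0.02])
toy_sequences = np.array([toy_returns[i:i + sequence_length]
                          for i in range(len(toy_returns) - sequence_length + 1)])
toy_sequences.shape  # (3, 5): three overlapping sequences of five consecutive returns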
The extracted individual sequences of daily closing prices are then collected in a single data frame.\n\n# \n\n# Determine the max number of training (\"in-sample\") sequences:\n\n#%%\n\n# determine max train index\nmax_train_index = (\n (train_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual training sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the training dataset\nfor i in range(1, max_train_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.array(\n train_stock_data.index[i:i + sequence_length].T)\n train_stock_sequence_data = np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n train_stock_sequence_data_date = np.vstack((train_stock_sequence_data_date, np.array(\n train_stock_data.index[i:i + sequence_length].T)))\n train_stock_sequence_data = np.vstack((train_stock_sequence_data, np.array(\n train_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained training sequences:\n\n#%%\n\ntrain_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of training timesteps:\n\n#%%\n\ntrain_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of training returns $s^{i}_{train}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\ntrain_stock_sequence_data[0:5, ]\n\n\n# Determine the max number of validation (\"out-of-sample\") sequences:\n\n#%%\n\n# determine max valid index\nmax_valid_index = (\n (valid_stock_data.shape[0] // sequence_length) - 1) * sequence_length\n\n\n# Extract individual validation sequences of length $5$ from the obtained daily returns:\n\n#%%\n\n# iterate over the distinct daily returns of the validation dataset\nfor i in range(1, max_valid_index):\n\n # case: initial sequence\n if i == 1:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.array(\n valid_stock_data.index[i:i + sequence_length].T)\n valid_stock_sequence_data = np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)\n\n # case: non-initial sequence\n else:\n\n # convert to numpy array and collect sequence of timesteps and daily returns\n valid_stock_sequence_data_date = np.vstack((valid_stock_sequence_data_date, np.array(\n valid_stock_data.index[i:i + sequence_length].T)))\n valid_stock_sequence_data = np.vstack((valid_stock_sequence_data, np.array(\n valid_stock_data['RETURN'][i:i + sequence_length].T)))\n\n\n# Determine the total number of obtained validation sequences:\n\n#%%\n\nvalid_stock_sequence_data.shape\n\n\n# Inspect the top five collected sequences of validation timesteps:\n\n#%%\n\nvalid_stock_sequence_data_date[0:5, ]\n\n\n# Inspect the top five collected sequences of validation returns $s^{i}_{valid}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$:\n\n#%%\n\nvalid_stock_sequence_data[0:5, ]\n\n\n# #### 2.4 Conduct Input-Target Split for Neural Network Training\n\n# Before we continue the date pre-processing, let's briefly revisit how RNN's or, more specifically, LSTM based NN's can be trained to predict the next element of an input sequence. The cartoon below is derived from the \"Next Word Predictor\" Example that we also discussed in the course. 
For each **input return** $r_{i}$ of the input return training sequence $s^i$ the LSTM is supposed to learn to **predict the return** of the next time-step $\\hat{r}_{i+1}$. In order to make such a future return $\\hat{r}_{i+1}$ prediction the LSTM uses it's learned hidden state information $h_{i}$ as well as the current return $r_{i}$ as an input.\n#\n# For each time-step the predicted return $\\hat{r}_{i+1}$ is then compared to the **target return** $r_{i+1}$. The discrepancy between both is collected as a loss $\\mathcal{L}$ for the distinct timesteps. The accumulation of the individual time-step losses is accumulated as the total loss of a sequence $\\mathcal{L}_{All}$.\n\n# \n\n# Seperate each training sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{train, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{train, target}=r_{t+1}$.\n\n# \n\n# In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\ntrain_sequences_input = torch.from_numpy(\n train_stock_sequence_data[:, :-1]).float()\ntrain_sequences_target = torch.from_numpy(\n train_stock_sequence_data[:, 1:]).float()\n\n\n# Seperate each validation sequence $s^{i}$ into time-steps of input returns denoted by $s^{i}_{valid, input}=\\{r_{t-n-1}, ..., r_{t-1}, r_{t}\\}$ and the time-step of the to be predicted target return denoted by $s^{i}_{valid, target}=r_{t+1}$. In addition, we convert both the input returns as well as the target returns to PyTorch tensors:\n\n#%%\n\nvalid_sequences_input = torch.from_numpy(\n valid_stock_sequence_data[:, :-1]).float()\nvalid_sequences_target = torch.from_numpy(\n valid_stock_sequence_data[:, 1:]).float()\n\n\n# To train an LSTM neural network, we tailor the dataset class provided by the PyTorch library. We overwrite the individual functions of the dataset class. So that our dataset will supply the neural network with the individual training sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$ throughout the training process:\n\n#%%\n\n# define daily returns dataset\nclass DailyReturnsDataset(data.Dataset):\n\n # define the class constructor\n def __init__(self, sequences, targets):\n\n # init sequences and corresponding targets\n self.sequences = sequences\n self.targets = targets\n\n # define the length method\n def __len__(self):\n\n # returns the number of samples\n return len(self.targets)\n\n # define the get item method\n def __getitem__(self, index):\n\n # determine single sequence and corresponding target\n sequence = self.sequences[index, :]\n target = self.targets[index, :]\n\n # return sequences and target\n return sequence, target\n\n\n# Once we have specified the daily returns dataset class we instantiate it using the new daily closing dataset using the prepared training input sequences $s^{i}_{train, input}$ and corresponding targets $s^{i}_{train, target}$:\n\n#%%\n\ntrain_dataset = DailyReturnsDataset(\n train_sequences_input, train_sequences_target)\n\n\n# Let's see how it works by getting the 42th sequence and its corresponding targets:\n\n#%%\n\ntrain_dataset.__getitem__(42)\n\n\n# ### 3. Neural Network Implementation and Loss Function\n\n# In this section, we will implement the LSTM architecture of the to be learned time series model. Furthermore, we will specify the loss-function, learning-rate and optimization technique used in the network training.\n\n# #### 3.1. 
Implementation of the LSTM Architecture\n\n# In this section, we will implement the architecture of the LSTM neural network utilized to predict future returns of financial time series data, e.g. as in this example, the future returns of a given stock. The neural network, which we name **'LSTMNet'** consists in total of three layers. The first two layers correspond to LSTM cells, while the third layer corresponds to a fully-connected linear layer.\n\n# \n\n# The general LSTM cell structure as well as the formal definition of its individual gate functions are shown in the following (not considering the bias of each layer for simplicity):\n\n# \n\n# (Source: https://pytorch.org/docs/stable/nn.html)\n\n# Each LSTM layer consits of a LSTM cell exhibiting a hidden state of 51 dimensions. The third linear squeezes the 51 hidden state dimensions of the second LSTM cell into a single output dimension. The single output signal of the linear layer refers to the return of the next time-step predicted by the neural network. Please note, that the choice of the implemented architecture and network hyperparameters is arbitrary and should in a real-world scenario be evaluated and selected thoroughly through experimentation.\n\n#%%\n\n# implement the LSTMNet network architecture\nclass LSTMNet(nn.Module):\n\n # define class constructor\n def __init__(self):\n\n super(LSTMNet, self).__init__()\n\n # define lstm nn architecture\n self.lstm1 = nn.LSTMCell(1, 51) # first lstm layer\n self.lstm2 = nn.LSTMCell(51, 51) # second lstm layer\n self.linear = nn.Linear(51, 1) # final linear layer\n\n # define network forward pass\n def forward(self, input):\n\n # init predictions\n predictions = []\n\n # init the lstm hidden states\n h_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n h_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # init the lstm cell states\n c_t1 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n c_t2 = torch.zeros(input.size(0), 51, dtype=torch.float).to(device)\n\n # iterate over distinct time steps\n for i, input_t in enumerate(input.chunk(input.size(1), dim=1)):\n\n # propagate through time step data\n h_t1, c_t1 = self.lstm1(input_t, (h_t1, c_t1))\n h_t2, c_t2 = self.lstm2(h_t1, (h_t2, c_t2))\n prediction = self.linear(h_t2)\n\n # collect predictions\n predictions += [prediction]\n\n # stack predictions\n predictions = torch.stack(predictions, 1).squeeze(2)\n\n # return predictions\n return predictions\n\n\n# Now, that we have implemented our first LSTM neural network we are ready to instantiate a model to be trained:\n\n#%%\n\nlstm_model = LSTMNet().to(device)\n\n\n# Once the model is initialized, we can visualize the model structure and review the implemented network architecture by execution of the following cell:\n\n#%%\n\n# print the initialized architectures\nprint('[LOG] LSTMNet architecture:\\n\\n{}\\n'.format(lstm_model))\n\n\n# Looks like intended? Great! 
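# As an additional, optional sanity check (not part of the original lab), we can push a small batch of random dummy sequences through the untrained model and verify that it returns one prediction per input time-step:

#%%

# smoke test with random data: 8 dummy sequences of `time_steps` returns each
dummy_batch = torch.randn(8, time_steps).to(device)
with torch.no_grad():
    dummy_predictions = lstm_model(dummy_batch)
print(dummy_predictions.shape)  # expected: torch.Size([8, 4])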
Finally, let's have a look into the number of model parameters that we aim to train in the next steps of the notebook:\n\n#%%\n\n# init the number of model parameters\nnum_params = 0\n\n# iterate over the distinct parameters\nfor param in lstm_model.parameters():\n\n # collect number of parameters\n num_params += param.numel()\n\n# print the number of model paramters\nprint('[LOG] Number of to be trained LSTMNet model parameters: {}.'.format(num_params))\n\n\n# Ok, our \"simple\" `LSTMNet` model already encompasses an impressive number **32'284 model parameters** to be trained.\n\n# #### 3.2. Definition of the Training Loss Function and Learning Rate\n\n# We are now good to train the network. However, prior to starting the training, we need to define an appropriate loss function. Remember, we aim to train our model to learn a set of model parameters $\\theta$ that minimize the prediction error of the true return $r_{t+1}$ and the by the model predicted return $\\hat{r}_{t+1}$ at a given time-step $t+1$ of sequence $s^{i}$. In other words, for a given sequence of historic returns we aim to learn a function $f_\\theta$ that is capable to predicts the return of the next timestep as faithfully as possible, as expressed by:\n\n#
$\\hat{r}_{t+1} = f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})$.
\n\n# Thereby, the training objective is to learn a set of optimal model parameters $\\theta^*$ that optimize $\\min_{\\theta} \\|r_{t+1} - f_\\theta(r_{t}, r_{t-1}, ..., r_{t-n})\\|$ over all time-steps $t$ contained in the set of training sequences $s_{train}$. To achieve this optimization objective, one typically minimizes a loss function $\\mathcal{L_{\\theta}}$ while training the neural network. In this lab we use the **'Mean Squared Error (MSE)'** loss, as denoted by:\n\n#
$\mathcal{L}^{MSE}_{\theta} (r_{t+1}, \hat{r}_{t+1}) = \frac{1}{N} \sum_{i=1}^{N} \| r^{i}_{t+1} - \hat{r}^{i}_{t+1}\|^{2},$ where $i$ indexes the $N$ predictions over which the loss is averaged.
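# To make the formula concrete: for hypothetical targets $[0.01, -0.02]$ and predictions $[0.00, 0.00]$ the loss amounts to $\frac{1}{2}((0.01-0.00)^{2} + (-0.02-0.00)^{2}) = 0.00025$. The same value is obtained with PyTorch's built-in `MSELoss`, which we will instantiate below:

#%%

# toy check of the MSE formula (illustrative values only)
toy_target = torch.tensor([0.01, -0.02])
toy_prediction = torch.tensor([0.00, 0.00])
print(nn.MSELoss()(toy_prediction, toy_target))  # 0.00025 (up to print precision)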
\n\n#%%\n\nloss_function = nn.MSELoss().to(device)\n\n\n# Throughout the training process, the PyTorch library will automatically calculate the loss magnitude, compute the gradient, and update the parameters $\\theta$ of the LSTM neural network. We will use the **\"Adaptive Moment Estimation Optimization\" (ADAM)** technique to optimize the network parameters. Furthermore, we specify a constant learning rate of $l = 1e-06$. For each training step, the optimizer will update the model parameters $\\theta$ values according to the degree of prediction error (the MSE loss).\n\n#%%\n\nlearning_rate = 1e-06 # set constant learning rate\n# define optimization technique\noptimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)\n\n\n# Now that we have successfully implemented and defined the three ANN building blocks let's take some time to review the `LSTMNet` model definition as well as the `MSE loss` function. Please, read the above code and comments carefully and don't hesitate to let us know any questions you might have.\n\n# ### 4. Training the Neural Network Model\n\n# In this section, we will train the LSTM neural network model (as implemented in the section above) using the prepared dataset of daily return sequences. Therefore, we will have a detailed look into the distinct training steps and monitor the training progress.\n\n# #### 4.1. Preparing the Network Training\n\n# Let's now start to learn a model by training the NN for **5 epochs** in mini-batches of the size of **128 sequences** per batch. This implies that the whole dataset will be fed to the network **5 times** in chunks of 128 sequences yielding to **32 mini-batches** (4'068 training sequences / 128 sequences per mini-batch) per epoch:\n\n#%%\n\n# specify the training parameters\nnum_epochs = 200 # number of training epochs\nmini_batch_size = 128 # size of the mini-batches\n\n\n# Furthermore, lets specify and instantiate a corresponding PyTorch data loader that feeds the image tensors to our neural network:\n\n#%%\n\ndl = dataloader.DataLoader(\n train_dataset, batch_size=mini_batch_size, shuffle=True)\n\n\n# #### 4.2. Running the Network Training\n\n# Finally, we start training the model. The training procedure of each mini-batch is performed as follows:\n#\n# >1. do a forward pass through the LSTMNet network,\n# >2. compute the mean-squared prediction error $\\mathcal{L}^{MSE}_{\\theta} (r_{t+1}, \\hat{r}_{t+1}) = \\frac{1}{N} \\sum_{i=1}^N \\| r_{t+1} - \\hat{r}_{t+1}\\|^{2}$,\n# >3. do a backward pass through the LSTMNet network, and\n# >4. update the parameters of the network $f_\\theta(\\cdot)$.\n#\n# To ensure learning while training the LSTM model we will monitor whether the loss decreases with progressing training. Therefore, we obtain and evaluate the mean prediction performance over all mini-batches in each training epoch. 
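# Before launching the actual training run, it can also be helpful (an optional inspection step, not part of the original lab) to draw a single mini-batch from the data loader and check its dimensions:

#%%

# inspect the shape of one mini-batch of input sequences and targets
sequence_batch, target_batch = next(iter(dl))
print(sequence_batch.shape, target_batch.shape)  # torch.Size([128, 4]) torch.Size([128, 4])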
Based on this evaluation we can conclude on the training progress and whether the loss is converging (indicating that the model might not improve any further).\n#\n# The following elements of the network training code below should be given particular attention:\n#\n# >- `loss.backward()` computes the gradients based on the magnitude of the reconstruction loss,\n# >- `optimizer.step()` updates the network parameters based on the gradient.\n\n#%%\n\n# init collection of training epoch losses\ntrain_epoch_losses = []\n\n# set the model in training mode\nlstm_model.train()\n\n# init the best loss\nbest_loss = 100.00\n\n# iterate over epochs\nfor epoch in range(0, num_epochs):\n\n # init collection of mini-batch losses\n train_mini_batch_losses = []\n\n # iterate over mini-batches\n for sequence_batch, target_batch in dl:\n\n # push mini-batch data to computation device\n sequence_batch = sequence_batch.to(device)\n target_batch = target_batch.to(device)\n\n # predict sequence output\n prediction_batch = lstm_model(sequence_batch)\n\n # calculate batch loss\n batch_loss = loss_function(prediction_batch, target_batch)\n\n # run backward gradient calculation\n batch_loss.backward()\n\n # update network parameters\n optimizer.step()\n\n # collect mini-batch loss\n train_mini_batch_losses.append(batch_loss.data.item())\n\n # determine mean min-batch loss of epoch\n train_epoch_loss = np.mean(train_mini_batch_losses)\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} train-loss: {}'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n # determine mean min-batch loss of epoch\n train_epoch_losses.append(train_epoch_loss)\n\n # print epoch and save models\n if epoch % 10 == 0 and epoch > 0:\n\n # case: new best model trained\n if train_epoch_loss < best_loss:\n\n # store new best model\n model_name = 'best_lstm_model_{}.pth'.format(str(epoch))\n torch.save(lstm_model.state_dict(),\n os.path.join(\"./models\", model_name))\n\n # update best loss\n best_loss = train_epoch_loss\n\n # print epoch loss\n now = dt.datetime.utcnow().strftime(\"%Y%m%d-%H:%M:%S\")\n print('[LOG {}] epoch: {} new best train-loss: {} found'.format(str(now),\n str(epoch), str(train_epoch_loss)))\n\n\n# Upon successful training let's visualize and inspect the training loss per epoch:\n\n#%%\n\n# prepare plot\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# add grid\nax.grid(linestyle='dotted')\n\n# plot the training epochs vs. the epochs' prediction error\nax.plot(np.array(range(1, len(train_epoch_losses)+1)),\n train_epoch_losses, label='epoch loss (blue)')\n\n# add axis legends\nax.set_xlabel(\"[training epoch $e_i$]\", fontsize=10)\nax.set_ylabel(\"[Prediction Error $\\mathcal{L}^{MSE}$]\", fontsize=10)\n\n# set plot legend\nplt.legend(loc=\"upper right\", numpoints=1, fancybox=True)\n\n# add plot title\nplt.title('Training Epochs $e_i$ vs. Prediction Error $L^{MSE}$', fontsize=10)\n\n\n# Ok, fantastic. The training error is nicely going down. We could train the network a couple more epochs until the error converges. But let's stay with the 200 training epochs for now and continue with evaluating our trained model.\n\n# ### 5. Evaluation of the Trained Neural Network Model\n\n# In this section, we will conduct a visual comparison of the predicted daily returns to the actual ('true') daily returns. The comparison will encompass the daily returns of the in-sample time period as well as the returns of the out-of-sample time period.\n\n# #### 5.1. 
In-Sample Evaluation of the Trained Neural Network Model\n\n# Before starting our evaluation, let's load the best performing model or an already pre-trained model (as done below). Remember, that we stored a snapshot of the model after each training epoch to our local model directory. We will now load one of the (hopefully well-performing) snapshots saved.\n\n#%%\n\n# init the pre-trained model architecture\nlstm_model_pretrained = LSTMNet().to(device)\n\n# set the pre-trained model name we aim to load\nmodel_name_pretrained = 'best_lstm_model_30000.pth'\n\n# load the pre-trained model paramaters\nlstm_model_pretrained.load_state_dict(torch.load(os.path.join(\n \"./models\", model_name_pretrained), map_location=lambda storage, loc: storage))\n\n\n# Let's inspect if the model was loaded successfully:\n\n#%%\n\n# set model in evaluation mode\nlstm_model_pretrained.eval()\n\n\n# Use the pre-trained model to determine the daily return predictions of the **in-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n train_predictions = lstm_model_pretrained(train_sequences_input.to(device))\n\n # collect prediction batch results\n train_predictions_list = train_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n train_targets_list = train_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **in-sample** predictions vs. the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(train_stock_sequence_data_date[:, -1],\n train_targets_list, color='C1', label='groundtruth (green)')\nax.plot(train_stock_sequence_data_date[:, -1],\n train_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(train_stock_sequence_data_date[:, -1].min(),\n train_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title('LSTM NN In-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)\n\n# set axis labels\nplt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n\n# set axis ticks fontsize\nplt.xticks(fontsize=8)\nplt.yticks(fontsize=8)\n\n\n# #### 5.2. Out-of-Sample Evaluation of the Trained Neural Network Model\n\n# Use the pre-trained model to determine the daily return predictions of the **out-of-sample** sequence population:\n\n#%%\n\n# don't calculate gradients\nwith torch.no_grad():\n\n # predict sequence output\n valid_predictions = lstm_model_pretrained(valid_sequences_input.to(device))\n\n # collect prediction batch results\n valid_predictions_list = valid_predictions.cpu().detach().numpy()[\n :, -1].tolist()\n\n # collect target batch results\n valid_targets_list = valid_sequences_target.numpy()[:, -1].tolist()\n\n\n# Plot the pre-trained `LSTMNet` daily **out-of-sample** predictions vs. 
the target (\"ground-truth\") daily returns:\n\n#%%\n\n# plot the prediction results\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [15, 5]\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max())\n\n# set plot legend\nplt.legend(loc=\"lower right\", numpoints=1, fancybox=True)\n\n# set plot title\nplt.title(\n 'LSTM NN Out-of-Sample Prediction vs. Ground-Truth Market Prices', fontsize=10)", "original_comment": "# set axis labels\n", "target_code": "plt.xlabel('[time]', fontsize=8)\nplt.ylabel('[market price]', fontsize=8)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig = plt.figure()\nax = fig.add_subplot(111)\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_targets_list, color='C1', label='groundtruth (green)')\nax.plot(valid_stock_sequence_data_date[:, -1],\n valid_predictions_list, color='C0', label='predictions (blue)')\n# set y-axis limits\nax.set_xlim(valid_stock_sequence_data_date[:, -1].min(),\n valid_stock_sequence_data_date[:, -1].max\n", "model": "natural", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n\ngit_dir = 
'/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 
'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = 
hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = 
pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. Starting with Gene Set 2:\n\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n 
xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = 
go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). 
We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
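# (As a toy illustration of the weighting scheme described above, with made-up numbers: the cluster score of a sample is the mean over genes of expression times cluster centrality, so genes with zero centrality add nothing to the weighted sum.)

#%%

# toy example of a centrality-weighted cluster score (3 genes x 2 samples)
toy_weights = np.array([0.8, 0.1, 0.0])            # cluster centrality per gene
toy_expression = np.array([[1.0, 2.0],             # gene x sample expression values
                           [3.0, 4.0],
                           [5.0, 6.0]])
toy_cluster_score = (toy_expression.T * toy_weights).mean(1)
print(toy_cluster_score)  # one score per sample: [0.36666667 0.66666667]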
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, 
posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n\ncr3.sort_values('index', inplace=True)\n\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom nilearn import image\nimport nibabel as ni\nimport HAP_Utils as hap\nimport sys\nimport pandas\nimport os\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom glob import glob\nfrom scipy import stats\nfrom sklearn import model_selection, linear_model\nfrom scipy import ndimage\nfrom sklearn.neighbors import kneighbors_graph\nfrom sklearn.metrics import calinski_harabaz_score\nfrom sklearn.metrics import silhouette_score\nfrom sklearn.cluster import AgglomerativeClustering, SpectralClustering\nfrom statsmodels.nonparametric.smoothers_lowess import lowess\n\n#%%\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ngit_dir = '/home/users/jvogel/git/Hippocampus_AP_Axis/'\n\nsys.path.insert(0, git_dir)\n\n\n# # Get hippocampus samples\n# Note: The spreadheets below are generated in NB1, or in the case of \"probes\", are straight from the Allen Brain Atlas dataset\n\n#%%\n\nwdir = os.path.join(git_dir, 'Data')\naba_dir = '/data1/users/jvogel/Allen_Human_Brain_Atlas/'\n\ndf = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_sample_info.csv'), index_col=0)\nxp = pandas.read_csv(os.path.join(\n wdir, 'MAIN_hippocampus_gxp.csv'), index_col=0)\ngdf = pandas.read_csv(os.path.join(wdir, 'MAIN_gcx_wholebrain_info.csv'))\nprobes = pandas.read_csv(os.path.join(\n aba_dir, 'normalized_microarray_donor9861/Probes.csv'))\n\n#%%\n\n# Uncomment if you want to save figures to disk\n# fig_dir =\n\n\n# # Run main analysis pipeline to get variables\n\n#%%\n\npcalr_out_NEW = hap.PCA_LR_pipeline(xp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n test_gene_num=[100],\n sanity_check_style='model')\n\n\n# # Examining the stability of model after removing genes\n\n#%%\n\n# THIS TAKES AWHILE -- I RUN IT OVERNIGHT, ALONG WITH THE NEXT ONE\n\n# RERUNNING MODEL AFTER REMOVING TOP 50 ANTERIOR\n# AND TOP 50 POSTERIOR PROBES\n\ncv_acc = []\nimp_genes = []\nfxp = pandas.DataFrame(xp, copy=True)\nktxp = pandas.DataFrame(xp, copy=True)\nfor i in range(589):\n print('round', i)\n # Run model\n jnk = hap.PCA_LR_pipeline(fxp.values.T,\n df.position_along_hipp_AP_axis,\n cv_strategy='score',\n test_gene_num=[50],\n sanity_check_style='model')\n # Store accuracy\n cv_acc.append(jnk['CV_scores'])\n\n # Drop 100 most important genes\n dropper = []\n [dropper.append(x) for x in jnk['gene_selections']['posterior_genes_50']]\n [dropper.append(x) for x in jnk['gene_selections']['anterior_genes_50']]\n\n # QC stuff\n if i < 50:\n [imp_genes.append(ktxp.index[x]) for x in dropper]\n fxp.drop(dropper, inplace=True)\n ktxp.drop(ktxp.index[dropper], inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# Plot change in accuracy over iterations\nsns.set_context('notebook')\nplotr = pandas.DataFrame(index=range(len(cv_acc)), 
columns=[\n 'Top Genes Removed', 'CV Accuracy'])\n#plotr.loc[:,'Top Genes Removed'] = range(100,5100,100)\nplotr.loc[:, 'Top Genes Removed'] = range(100, 58610, 100)\nplotr.loc[:, 'CV Accuracy'] = cv_acc\nplt.close()\nsns.factorplot(x='Top Genes Removed', y='CV Accuracy',\n data=plotr, aspect=1.5, size=5)\nplt.show()\n\n\n# Now repeat except remove 100 random probes instead of 100 top probes\n\n#%%\n\nnull_cv_acc = []\nfxp = pandas.DataFrame(xp, copy=True)\nfor i in range(585):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# LOOKS LIKE I DIDN'T TAKE IT ALL THE WAY...\nfor i in range(585, 589):\n print('round', i)\n jnk = PCA_LR_pipeline(fxp.values.T,\n (df.position_along_hipp_AP_axis),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n null_cv_acc.append(jnk['CV_scores'])\n\n dropper = np.random.randint(0, len(fxp.index), 100)\n fxp.drop(dropper, inplace=True)\n fxp.index = range(len(fxp.index))\n\n#%%\n\n# PUT IT ALL TOGETHER\nplotr = pandas.DataFrame(index=range(len(cv_acc)*2),\n columns=['Genes Removed', 'CV Accuracy (r2)',\n 'Category'])\n#plotr.loc[:,'Genes Removed'] = list(range(100,5100,100))*2\nplotr.loc[:, 'Genes Removed'] = list(range(100, 59000, 100))*2\nplotr.loc[:, 'CV Accuracy (r2)'] = cv_acc + null_cv_acc\n#plotr.loc[:,'Category'] = ['Top Genes' if x <50 else 'Random Genes' for x in range(100)]\nplotr.loc[:, 'Category'] = ['Top Genes' if x <\n 589 else 'Random Genes' for x in range(1178)]\nsns.set_context('poster')\n\n#%%\n\n# PLOT IT\nplt.close()\ng = sns.stripplot(x='Genes Removed', y='CV Accuracy (r2)', hue='Category',\n data=plotr)\nfor ind, label in enumerate(g.get_xticklabels()):\n # if ind % 10 == 0: # every 10th label is kept\n if ind % 20 == 0: # every 20th label is kept\n label.set_visible(True)\n label.set_rotation(90)\n else:\n label.set_visible(False)\n# plt.savefig(os.path.join(fig_dir,'CV_Acc_gene_removal_589.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n#%%\n\n# STOPPING POINTS OF EACH GENE SET\nstep1 = 100\nstep2 = 600\nstep3 = 2800\n\n#%%\n\n# SUMMARIZE INFORMATION OF TOP 5000 GENES...\nmod_genes = pandas.DataFrame(index=range(5000),\n columns=['probe_ind', 'ant-post', 'step'])\nmod_genes.loc[:, 'probe_ind'] = imp_genes\nmod_genes.loc[:, 'gene_symbol'] = probes.loc[mod_genes['probe_ind'].values,\n 'gene_symbol'].values\nmod_genes.loc[:, 'step'] = [1 if x <= step1 else 2 if x <=\n step2 else 3 if x <= step3 else 4 for x in range(5000)]\nant_ind, post_ind = [], []\nfor i in range(50):\n post_ind += (np.arange(50) + (100*i)).tolist()\n ant_ind += (np.arange(50, 100) + (100*i)).tolist()\nmod_genes.loc[post_ind, 'ant-post'] = 'posterior'\nmod_genes.loc[ant_ind, 'ant-post'] = 'anterior'\nmod_genes.head()\n\n#%%\n\nmod_genes.to_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'))\n\n#%%\n\nmod_genes = pandas.read_csv(os.path.join(wdir, 'MAIN_model_genes_of_importance.csv'),\n index_col=0)\nmod_genes.index = mod_genes.probe_ind\nmod_genes.head()\n\n\n# # Comparing individual gene sets in predicting A-P axis\n\n#%%\n\n# Get a Probe x Sample matrix for each \"gene set\"\ntop_100 = xp.loc[mod_genes[mod_genes.step == 1].index]\ntop_100_600 = xp.loc[mod_genes[mod_genes.step == 2].index]\ntop_600 = xp.loc[mod_genes[mod_genes.step < 
3].index]\ntop_600_2700 = xp.loc[mod_genes[mod_genes.step == 3].index]\ntop_2700 = xp.loc[mod_genes[mod_genes.step < 4].index]\ntop_2700_5k = xp.loc[mod_genes[mod_genes.step == 4].index]\nt5k_ind = [x for x in xp.index if x not in mod_genes.probe_ind.values]\ntop_5k_end = xp.loc[t5k_ind]\nall_subs = xp.loc[mod_genes.loc[mod_genes.index].index]\n\n#%%\n\n# ID non-overlapping gene sets\ngene_sets = [top_100, top_100_600,\n top_600_2700, top_2700_5k,\n top_5k_end, all_subs]\n\n# Make empty dataframe\nhla_scores = pandas.DataFrame(index=range(180), columns=[\n 'score', 'model_type', 'sample'])\nhla_scores.loc[:, 'model_type'] = (\n ['model']*10 + ['null']*10 + ['inner set (100)']*10) * 6\nhla_scores.loc[:, 'sample'] = ['Set1 (n=100)']*30 + ['Set2 (n=500)']*30 + ['Set3 (n=1100)']*30 + [\n 'Set4 (n=2300)']*30 + ['Not in Sets1-4 (n=53k)']*30 + ['all (n=58k)']*30\n\n\n# For each non-overlapping gene-set, performing the following models:\n# * Perform 10 (bootstrapped) models using only probes in the gene set\n# * Perform 10 \"null\" models involving n-length sets of randomly selected probes where n equals the size of the gene set\n# * Perform 10 \"inner-set\" models involving 100 probes randomly selected from *within* the gene set.\n#\n# This latter comparison is to compare to other sets to Set 1, which contains only 100 probes.\n\n#%%\n\n# THIS ALSO TAKES QUITE A LONG TIME TO RUN\n\ni = 0\nfor g, gset in enumerate(gene_sets):\n if g == 0:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null', smallset=True)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100,\n smallset=True)\n i += 10\n else:\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis)\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='null')\n i += 10\n hla_scores.loc[i:i+9, 'score'] = hap.bootstrap_model(gset, xp,\n df.position_along_hipp_AP_axis,\n bs_type='inner_set', inner_set=100)\n i += 10\n\n\n# Plot it\n\n#%%\n\nsns.set_context('notebook')\nplt.close()\ng = sns.factorplot(x='sample', y='score', hue='model_type', data=hla_scores)\ng.set_xticklabels(g.ax.get_xticklabels(), rotation=90)\nplt.ylabel('Axis Position Accuracy')\n#plt.savefig(os.path.join(fig_dir,'model_comparison.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# SAVE THE GENE SETS TO DISK AND SAVE THE PREDICTED VALUES TO SPREADSHEET FOR USE LATER\ngsets = dict(zip(['top_100', 'top_100_600',\n 'top_600_2700', 'top_2700_5k',\n 'top_5k_end'], gene_sets[:-1]))\ngsets.update({'top_600': top_600})\ngsets.update({'top_2700': top_2700})\n\nfor gset, xpdf in gsets.items():\n print('working on', gset)\n if gset != 'top_100':\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis,\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n pca_tfm = otpt['pca_object'].transform(\n xpdf.values.T) # transform with PCA\n predicted = otpt['final_model'].predict(\n pca_tfm) # get predicted values\n else:\n otpt = hap.PCA_LR_pipeline(xpdf.T, df.position_along_hipp_AP_axis, pca=None,\n clf=linear_model.LassoCV(\n cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n predicted = otpt['final_model'].predict(xpdf.T) # get 
predicted values\n df.loc[:, gset] = predicted\n\n xpdf.to_csv(os.path.join(wdir, 'GSET_%s_gxp.csv' % gset))\n\n#%%\n\ndf.to_csv(os.path.join(wdir, 'MAIN_hippocampus_sample_info_W_SMALL_SETS.csv'))\n\n\n# ## Use LIME do to some Feature Explaining!\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100,\n df.position_along_hipp_AP_axis,\n probes)\n# Scroll all the way down for plots\n\n\n# #### How does the model do without those top genes, compared to removing five random genes?\n# This did not make it into the manuscript...\n\n#%%\n\n# ID selected probes\nimpgenes = ['RSPH9', 'FAM43B', 'FSTL4', 'NTN1', 'NR2F2']\nimp_ind = [x for x in top_100.index if probes.loc[x, 'gene_symbol'] in impgenes]\n\n# Make empty dataframe for results\nn_genes = len(imp_ind)\nimp_rem_res = pandas.DataFrame(index=range(n_iter+1),\n columns=['model', 'r2'])\n# Drop the selected probes\nc = 0\njnk = pandas.DataFrame(top_100, copy=True)\njnk.drop(imp_ind, inplace=True)\n# Run model without those probes\nnewmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n# Store results\nimp_rem_res.loc[c, 'model'] = 'True'\nimp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\nc += 1\n\n# Now repeat n times but selecting 5 random genes for removal\nn_iter = 100\nfor i in range(n_iter):\n dsamp = np.random.choice(top_100.index, size=n_genes, replace=False)\n jnk = pandas.DataFrame(top_100, copy=True)\n jnk.drop(dsamp, inplace=True)\n newmod = PCA_LR_pipeline(jnk.values.T,\n (df.position_along_hipp_AP_axis), pca=None,\n clf=linear_model.LassoCV(cv=10, max_iter=5000),\n cv_strategy='score', illustrative=False,\n sanity_check_style='model')\n imp_rem_res.loc[c, 'model'] = 'Null'\n imp_rem_res.loc[c, 'r2'] = newmod['CV_scores']\n c += 1\n\n\n# Visualize results\n\n#%%\n\nplt.close()\nsns.barplot(x='model', y='r2', data=imp_rem_res,)\nplt.ylim(0.7, 0.89)\nplt.show()\n\n#%%\n\nplt.close()\nsns.stripplot(x='model', y='r2', hue='model', data=imp_rem_res, jitter=True)\n#plt.ylim(0.7, 0.89)\nplt.show()\n\n\n# ## Find \"bigrams\" (similar features) to the \"most important\" features\n# This function will search all available probes that have collinear expression patterns to a target probe, and will return some information. This also didn't make it into the MS\n\n#%%\n\n# NR2F2\nhap.find_bigram(xp, 40112, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_600_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# RSPH9\nhap.find_bigram(xp, 23274, probes.gene_symbol, report=False, check_type='r2', check_val=0.5,\n check_genes=probes.loc[top_2700.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FAM43B\nhap.find_bigram(xp, 22547, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n#%%\n\n# FSTL4\nhap.find_bigram(xp, 29383, probes.gene_symbol, report=False, check_type='r2', check_val=0.4,\n check_genes=probes.loc[top_100.index, 'gene_symbol'].unique().tolist())\n\n\n# #### Let's do some \"Feature Explainers for other gene sets. 
Starting with Gene Set 2:\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_100_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene Set 3\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_600_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# #### Gene set 1 + 2 + 3 Combined\n\n#%%\n\npltr = hap.feature_explainer_pipeline(top_2700, df.position_along_hipp_AP_axis,\n probes, nm_thresh=0.2)\n\n\n# ## Viewing expression patterns of top genes\n\n#%%\n\n# Gene Set 1\n\n# Identify gene set\njnk = xp.loc[mod_genes[mod_genes.step == 1]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\n# Cluster and plot\nplt.close()\ng = sns.clustermap( # jnk,\n sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n#g.fig.savefig(os.path.join(fig_dir,'top100_cluster.pdf'), bbox_inches='tight')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 1 + 2\n\njnk = xp.loc[mod_genes[mod_genes.step < 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(\n jnk, 3, 1), index=jnk.index, columns=jnk.columns)\nplt.close()\ng2 = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 2 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 2]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng2a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\nplt.show()\n\n#%%\n\n# Repeat for Gene Set 3 alone\n\njnk = xp.loc[mod_genes[mod_genes.step == 3]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n index=probes.loc[jnk.index, 'gene_symbol'],\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)\nplt.close()\ng3a = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='RdBu_r')\n# g3a.fig.savefig('/home/users/jvogel/Science/Allen_Human_Brain_Atlas/figs/top6_2700_cluster.pdf')\nplt.show()\n\n\n# ## Making meaningful clusters out of gene/annotation relationships\n# For Gene Sets 2 and 3, there were many, many \"hits\" indicating enriched functions, processes and components. 
These analyses attempt to cluster the individual genes into clusters of shared enriched terms.\n\n# #### Gene Set 2\n\n#%%\n\n# Locate spreadsheets, which were generated using GOrilla\n\n# Specifically, the specific genes produced in Set 2 and 3 above we're passed to GOrilla, and all genes\n# available in the dataset were entered as background.\n\ngo_proc = os.path.join(wdir, 'GOPROCESS.xls')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT.xls')\ngo_func = os.path.join(wdir, 'GOFUNCTION.xls')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\n# Consolidate into a dataframe\ngo_gsea = hap.prepare_GO_terms(top_100_600, gos, probes)\ngo_gsea.head()\n\n#%%\n\n# Quick preview of what the clusters might look like\n\nplt.close()\nsns.clustermap(go_gsea, metric='jaccard', col_cluster=False)\nplt.show()\n\n#%%\n\n# Cluster using 2-30-cluster solutions and compare based on silhouette scores and CH index\n\n# Create results dataframe\nks = range(2, 31)\ngo_solutions2 = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values.T\nfor k in ks:\n # Perform cluster analysis\n connectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions2.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions2.loc[k, 'CH_index'] = calinski_harabaz_score(\n X, cluster_labels)\n # Get information on cluster size\n mtx2 = pandas.DataFrame(go_gsea.T.values, copy=True)\n mtx2.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx2.label):\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions2.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions2.loc[k, 'min_size'] = np.min(sizes)\n go_solutions2.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions2.loc[:, 'k'] = go_solutions2.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions2)\nplt.show()\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='min_size', data=go_solutions2)\nplt.show()\n\n#%%\n\n# Run clustering based on using a solution of K=8\n\nsubfields = df.structure_acronym.unique()\nk = 8\n\n# Perform clustering and extract labels\nX = go_gsea.values.T\nconnectivity = kneighbors_graph(X, n_neighbors=10, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx2 = pandas.DataFrame(go_gsea.T, copy=True)\nmtx2.loc[:, 'label'] = cluster_labels\n\n# For each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Extract data from the cluster\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n print('n = ', len(clus_data))\n # Plot it\n plt.close()\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n\n # Identify the most frequently enriched terms in the cluster\n # and the percentage of genes in the cluster that show enrichment\n # for each term\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n # Print the most enriched genes\n print(clus_data.index.tolist())\n\n# # A bunch of extra exploratory stuff\n# plt.close()\n# # Identify which genes are posteriorly expressed vs. anteriorly expressed\n# # Good god this next line is hideous. 
I'm so sorry.\n# xp_mtx = g2.data2d.loc[[x for x in g2.data2d.index if x in probes[probes.gene_symbol.isin(top_hits.index)\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# # Print the number of anterior vs posterior genes\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# # And which they are\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# # Store the cluster's association with axis position\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# # And separately for each subfield\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# # Plot the raw expression patterns of the anterior and posterior genes in the cluster\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n# #for x in clus_data.index:\n# # print(x)\n# print('\\n\\n')\n\n\n# #### Pause for a revision: Which go clusters explain regional disease vulnerability?\n# This is jumping ahead to NB7, but it's easier to place the code here. For this to make sense, you may have to run NB7 and come back here (but the code after this section can be run without running this section). Unlike rsfmri connectivity and structural covariance to the hippocampus, disease vulnerability was not associated with HAGGIS composed of only Set1 features (i.e. the top 100 features of our model). We decided to look to see if specific GO clusters in Sets 2 and 3 (which were associated disease vulnerability) could explain the relationship between HAGGIS and disease, so we can compare them to the GO terms enrisched in Set1\n\n#%%\n\nshtz = sorted(glob(os.path.join(\n aba_dir, 'normalized_microarray_donor*/MExp_all_genes_ctr_for_donor')))\nholder = []\nfor sht in shtz:\n holder.append(pandas.read_csv(sht, index_col=0))\nbigdf = pandas.concat(holder, axis=1)\ndel(holder)\nncols = ['%s_%s' % (gdf.loc[x, 'donor'], bigdf.columns[x])\n for x in range(gdf.shape[0])]\nbigdf.columns = ncols\n\n\n# This runs a bunch of code that is explained much more slowly in NB7. It's copied exactly from NB7. 
Here we're running it all at once to get the desired outputs, namely sample-wise values for HAGGIS and disease expression\n\n#%%\n\n# CHANGE THIS TO YOUR FDG DIR\nfdg_dir = '/home/users/jvogel/Science/Allen_Human_Brain_Atlas/AD-FTD FDG Difference Map/'\n\n# gather all the images\nmni2mm = ni.load(os.path.join(wdir, 'MNI152_T1_2mm_brain.nii.gz'))\nd_diff = ni.load(os.path.join(fdg_dir, 'AD-FTD_globalnorm2.nii.gz'))\nHO = ni.load(os.path.join(wdir, 'HarvardOxford-sub-maxprob-thr25-1mm.nii.gz'))\ndkt = ni.load(os.path.join(wdir, 'dkt_atlas_1mm.nii.gz'))\ndiffs = {'dis': d_diff, 'dkt': dkt, 'HO': HO}\n\n# bring them to a common spac\ndiffs_2mm = {}\nfor lab, diff in diffs.items():\n print(lab, diff.shape)\n if lab == 'HO' or lab == 'dkt':\n nimg = image.resample_to_img(diff, mni2mm, interpolation='nearest')\n else:\n nimg = image.resample_to_img(diff, mni2mm)\n print('new shape', nimg.shape)\n diffs_2mm.update({lab: nimg})\n\n# make the brainmasks\n\ndkt2 = diffs_2mm['dkt'].get_data()\nHO2mm = diffs_2mm['HO'].get_data()\nmsk1 = np.array(HO2mm, copy=True)\nmsk1[HO2mm < 1] = 0\n# no cereb or brainstem\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\nmsk2 = np.array(dkt2, copy=True)\nmsk2[msk2 > 78] = 0\n\n# no cereb, brainstem or hippocampus\nmsk3 = np.array(msk2, copy=True)\nmsk3[dkt2 == 36] = 0\nmsk3[dkt2 == 75] = 0\n\n# get association with disease vulnerability\n# for samples within mask\n\nimg = diffs_2mm['dis'].get_data()\nvrad = 3\nvdim = 2\ngcx_col = 'AP_axis_gxp_signature_similarity_SPCR'\nbootstrap = False\nplabs = ['No brainstem, cerebellum or hippocampus',\n 'HAGGIS expression', 'FTD > AD vulnerability']\nres, vex = hap.run_gvfcx_analysis(img, gdf, msk3, vrad, vdim, gcx_col, plabs,\n bootstrap, n_iter=10, hue_vals=[], illustrative=True,\n joint_input='')\n\nmcoords_idx = []\nfor i, row in gdf.iterrows():\n coord = hap.convert_coords([row['mni_nlin_x'], row['mni_nlin_y'], row['mni_nlin_z']],\n 'xyz', vdim)\n coord = [round(x) for x in coord]\n # if msk[coord[0],coord[1],coord[2]] != 0:\n if msk3[coord[0], coord[1], coord[2]] > 0:\n mcoords_idx.append(i)\n\n\n# For each of the 8 clusters above, we will assign a \"cluster centrality\" weight to each gene in Set2, where the weight is determined by the percentage of cluster-specific enriched terms that gene is associated with. In this way, most genes will have a weight of 0. We then create a weighted mean of expression of all genes in Set2, weighted by cluster centrality. This creates a \"cluster score\", which we store.\n#\n# We also repeat this process 100 times but this time randomly shuffling the cluster centrality weights to create a null model. 
For each null model, we find the correlation between sample cluster score and sample disease vulnerability score, creating a null distribution to test against for the analysis below.\n\n#%%\n\n# WARNING: This take awhile to run\n\n# Create empty dataframe to store null associations\nn_iter = 100\nnulls = np.zeros((len(mtx2.label.unique())*2, n_iter))\n\n# Iterate through each cluster\nfor i in np.unique(mtx2.label):\n print('cluster', i)\n # Get cluster centrality\n weighter = pandas.DataFrame(columns=['weight', 'AP'])\n clus_data = mtx2[mtx2.label == i][mtx2.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n # separate into anterior and posterior genes\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n # find weighted mean (aka cluster score)\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set2_C%s_%s' % (i, ap)] = cluster_score # save it\n # iterate through this process n_iter times, shuffle weights, find association, repeat\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls[row_idx, p] = r2\n\n#%%\n\n# Add titles for clusters\ncr2.sort_values(['direction', 'cluster'], inplace=True)\ncr2.loc[:, 'Title'] = ['C0: Amine Processing',\n 'C1: Axon Guidance',\n 'C2: GABA Activity',\n 'C3: Hormonal Signaling',\n 'C4: Neuropeptide Activity',\n 'C5: Ion Transport',\n 'C6: LH Secretion',\n 'C7: Growth Factor Signaling'] * 2\n\n#%%\n\n# Plot r2 between each cluster score and disease vulenerability,\n# separately for anterior and posterior genes. Also plot gray bars\n# representing the upper 95% confidence interval of the null model.\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr2, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr2.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr2.loc[i, 'null'], cr2.loc[i, 'null']], 'gray')\n if cr2.loc[i, 'null'] < cr2.loc[i, 'r2']:\n plt.text(row_idx-0.17, cr2.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.17)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n# plt.savefig(os.path.join(fig_dir,'Disease_CR2.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# ### Looks great! 
Repeat with all of that with Set 3!\n\n#%%\n\ngo_proc = os.path.join(wdir, 'GOPROCESS_6_2700.xlsx')\ngo_comp = os.path.join(wdir, 'GOCOMPONENT_6_2700.xlsx')\ngo_func = os.path.join(wdir, 'GOFUNCTION_6_2700.xlsx')\ngos = [go_proc, go_comp, go_func]\n\n#%%\n\ngo_gsea = hap.prepare_GO_terms(top_600_2700, gos, probes)\ngo_gsea.head()\n\n#%%\n\nks = range(2, 50)\ngo_solutions = pandas.DataFrame(index=ks, columns=['silhouette', 'CH_index',\n 'mean_size', 'min_size', 'max_size'])\nX = go_gsea.values\nfor k in ks:\n connectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\n clusterer = AgglomerativeClustering(\n n_clusters=k, connectivity=connectivity)\n cluster_labels = clusterer.fit_predict(X)\n # silhouette\n go_solutions.loc[k, 'silhouette'] = silhouette_score(X, cluster_labels)\n # CH Index\n go_solutions.loc[k, 'CH_index'] = calinski_harabaz_score(X, cluster_labels)\n mtx = pandas.DataFrame(go_gsea.values, copy=True)\n mtx.loc[:, 'label'] = cluster_labels\n sizes = []\n for i in np.unique(mtx.label):\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n sizes.append(len(clus_data))\n # SIZES\n go_solutions.loc[k, 'mean_size'] = np.mean(sizes)\n go_solutions.loc[k, 'min_size'] = np.min(sizes)\n go_solutions.loc[k, 'max_size'] = np.max(sizes)\n print('finished', k)\ngo_solutions.loc[:, 'k'] = go_solutions.index\n\n#%%\n\nplt.close()\nsns.factorplot(x='k', y='silhouette', data=go_solutions)\nplt.show()\n\n#%%\n\nk = 12 # based on silhouette and perc_hits\nX = go_gsea.T.values\nconnectivity = kneighbors_graph(X, n_neighbors=100, mode='distance',\n metric='jaccard', include_self=False)\nclusterer = AgglomerativeClustering(n_clusters=k, connectivity=connectivity)\ncluster_labels = clusterer.fit_predict(X)\nmtx = pandas.DataFrame(go_gsea.T, copy=True)\nmtx.loc[:, 'label'] = cluster_labels\nfor i in np.unique(mtx.label):\n plt.close()\n print('cluster', i)\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n print('n = ', len(clus_data))\n sns.heatmap(pandas.DataFrame(clus_data),\n cmap='RdBu_r')\n plt.show()\n top_hits = (clus_data.sum()/go_gsea.T.sum()\n ).sort_values(ascending=False).head(20)\n print(top_hits)\n print(clus_data.index.tolist())\n # for x in clus_data.index:\n # print(x)\n print('\\n\\n')\n\n# xp_mtx = g3.data2d.loc[[x for x in g3.data2d.index if x in probes[probes.gene_symbol.isin(clus_data.index.tolist())\n# ].index]].reindex(columns=xp.columns)\n# rs = [stats.pearsonr(xp_mtx.loc[i],df.position_along_hipp_AP_axis.values)[0] for i in xp_mtx.index]\n# ap = np.array(['a' if x > 0 else 'p' for x in rs])\n# amtx = xp_mtx.loc[xp_mtx.index[ap=='a']]\n# pmtx = xp_mtx.loc[xp_mtx.index[ap=='p']]\n# print('%s anterior, %s posterior'%(len(amtx),len(pmtx)))\n# print('anterior:', probes.loc[amtx.index,'gene_symbol'].unique())\n# print('posterior:', probes.loc[pmtx.index,'gene_symbol'].unique())\n# phdf = pandas.DataFrame(df,copy=True)\n# phdf.loc[:,'a_gxp'] = amtx.mean().values\n# phdf.loc[:,'p_gxp'] = pmtx.mean().values\n# ar = stats.pearsonr(phdf.a_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# pr = stats.pearsonr(phdf.p_gxp,phdf.position_along_hipp_AP_axis)[0]**2\n# print('anterior r2 = %s, posterior r2 = %s'%(ar,pr))\n# for subfield in subfields:\n# ars = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['a_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# prs = stats.pearsonr(phdf[phdf.structure_acronym==subfield]['p_gxp'],\n# df[df.structure_acronym==subfield].position_along_hipp_AP_axis)[0]**2\n# 
print('%s anterior = %s, posterior = %s'%(subfield,ars,prs))\n\n# sns.heatmap(amtx,cmap='RdBu_r')\n# plt.show()\n# plt.close()\n# sns.heatmap(pmtx,cmap='RdBu_r')\n# plt.show()\n\n#%%\n\n# WARNING: THIS TAKES A VERRRRY LONG TIME TO RUN!\n\nn_iter = 100\nnulls2 = np.zeros((len(mtx.label.unique())*2, n_iter))\nfor i in np.unique(mtx.label):\n print('cluster', i)\n weighter = pandas.DataFrame(columns=['weight'])\n clus_data = mtx[mtx.label == i][mtx.columns[:-1]]\n top_hits = (clus_data.sum()/go_gsea.T.sum()).sort_values(ascending=False)\n for g in top_hits.index:\n inds = probes[probes.gene_symbol == g].index\n for ind in inds:\n if ind in mod_genes.index:\n weighter.loc[ind, 'weight'] = top_hits[g]\n weighter.loc[ind, 'AP'] = mod_genes.loc[ind, 'ant-post']\n for ap in ['anterior', 'posterior']:\n row_idx = i*2\n if ap == 'posterior':\n row_idx += 1\n wdf = weighter[weighter.AP == ap]\n X = bigdf.loc[wdf.index].T\n cluster_score = (X * wdf.weight.values).mean(1).values\n gdf.loc[:, 'Set3_C%s_%s' % (i, ap)] = cluster_score\n print('finding nulls')\n for p in range(n_iter):\n np.random.shuffle(wdf.weight.values)\n cluster_score = (X * wdf.weight.values).mean(1).values\n r2 = stats.pearsonr(cluster_score[mcoords_idx],\n np.array(vex['cx_vector']))[0]**2\n nulls2[row_idx, p] = r2\n\n#%%\n\ncr3.sort_values(['direction', 'cluster'], inplace=True)\ncr3.loc[:, 'Title'] = ['C0: Peptide Antigen Binding',\n 'C1: Amine Transport',\n 'C2: Response to Cu ions',\n 'C3: Anion Transporter Activity',\n 'C4: Cell Motility',\n 'C5: Serotonin Binding',\n 'C6: GABA Activity',\n 'C7: Vascular Growth Factor Activity',\n 'C8: Signal Transduction',\n 'C9: K Channel Activity',\n 'C10: Phosphorylation',\n 'C11: Lipid Transport'] * 2\n\n#%%\n\ncr3.sort_values('index', inplace=True)\n\n#%%\n\nsns.set_context('notebook', font_scale=2)\nplt.close()\ng = sns.barplot(x='Title', y='r2', hue='direction',\n data=cr3, palette=['orange', 'blue'])\nplt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\nfor i in cr3.index:\n row_idx = ((i+0.0001) / 2)\n if i % 2 == 0:\n row_idx -= 0.2\n else:\n row_idx -= 0.3\n plt.plot([row_idx-0.1, row_idx+0.1],\n [cr3.loc[i, 'null'], cr3.loc[i, 'null']], 'gray')\n if cr3.loc[i, 'null'] < cr3.loc[i, 'r2']:\n plt.text(row_idx-0.25, cr3.loc[i, 'r2'] + 0.005, '*')\nplt.ylim(0, 0.15)\ng.set_xticklabels(g.get_xticklabels(), rotation=90)\n\n# plt.savefig(os.path.join(fig_dir,'Disease_CR3.pdf'),\n# bbox_inches='tight')\nplt.show()\n\n\n# # Revision: Look at types of gene distributions\n#\n# The objective here is to see if there are different types of expression patterns along the hippocampal longitudinal axis besides linear gradients, and to see what the distribution of expression patterns is for each gene set. 
We will cluster all genes in Sets 1-4 and then examine the expression clusters and cluster membership within each gene set.\n\n#%%\n\n# GATHER ALL GENES\n\n# Identify gene set\njnk = xp.loc[mod_genes[(mod_genes.step < 5) # &(mod_genes['ant-post']=='anterior')\n ]['probe_ind'],\n xp.columns[df.sort_values('position_along_hipp_AP_axis').index]]\n\n# # Smooth the data along X (axis position) with a 3mm kernel for easier viewing/clustering\nsjnk = pandas.DataFrame(ndimage.gaussian_filter1d(jnk, 3, 1),\n #index = probes.loc[jnk.index,'gene_symbol'],\n index=jnk.index,\n columns=df.sort_values('position_along_hipp_AP_axis').position_along_hipp_AP_axis)", "original_comment": "# Cluster and plot\n", "target_code": "g = sns.clustermap(sjnk,\n col_cluster=False, metric='correlation', standard_scale=0,\n cmap='Reds')\nplt.show()\n", "project_metadata": {"full_name": "illdopejake/Hippocampus_AP_Axis", "description": "Code used for Hippocampus Anterior/Posterior gene expression and neuroimaging analyses ", "topics": [], "git_url": "git://github.com/illdopejake/Hippocampus_AP_Axis.git", "stars": 7, "watchers": 7, "forks": 1, "created": "2018-05-20T18:18:47Z", "size": 149297, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 20340748, "Python": 58444, "Shell": 2454}, "last_updated": "2020-12-20T09:17:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "g = sns.clustermap(sjnk,\n row_cluster=False,\n col_cluster=False,\n row_linkage=None,\n col_linkage=None,\n cmap='RdBu_r',\n linewidths=.5,\n figsize=(10, 10),\n vmin=0,\n vmax=1,\n cbar_kws={'label': 'Normalized Expression'})\nplt.tight_layout()\nplt.savefig(os.path.join(fig_dir, 'SJNK.pdf'),\n bbox_inches='\n", "model": "natural", "intent": "# Cluster and plot"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n\n# should be version 1.x\nprint(tf.__version__)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\nfrom tensorflow.keras.backend import 
set_session\nimport re\nimport os\nimport scipy\nimport pickle\nimport cooltools as ct\nimport cooler\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras import layers\nfrom tensorflow.python.framework import ops\nimport math\nimport h5py\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport tensorflow as tf\nfrom cooltools.lib.numutils import set_diag\nfrom Bio import SeqIO\n\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.preprocessing import OneHotEncoder\n\nfrom models import advanced_2d_cnn\n\nimport pandas\n\npandas.set_option('display.max_columns', 500)\npandas.set_option('display.max_rows', 500)\n\n#%%\n\n# should be version 1.x\nprint(tf.__version__)\n\n#%%", "original_comment": "# the following directive activates inline plotting\n", "target_code": "get_ipython().run_line_magic('matplotlib', 'inline')\n", "project_metadata": {"full_name": "NeilAlishev/HiCPredictor", "description": "Predict Hi-C maps from the DNA sequence using deep convolutional neural networks", "topics": [], "git_url": "git://github.com/NeilAlishev/HiCPredictor.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-01-12T17:39:25Z", "size": 25045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 9881823, "Python": 17479}, "last_updated": "2020-11-13T16:32:28Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "plt.style.use('ggplot')\n", "model": "docstring", "intent": "# activate inline plotting"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n\n# View one of the records\ndata.take(1)\n\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# 
Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n\nlr_model = lr.fit(train)\n\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Logistic Regression with PySpark\n\n# This notebook demonstrates how to train and measure a logistic regression model with PySpark.\n#\n# * Method: [Logistic Regression](https://spark.apache.org/docs/2.2.0/mllib-linear-methods.html#logistic-regression)\n# * Dataset: Spark MLlib Sample LibSVM Data\n\n# ## Imports\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql import SQLContext\nfrom pyspark import SparkContext\nimport numpy as np\nimport findspark\nfrom os import environ\n# Set SPARK_HOME\nenviron[\"SPARK_HOME\"] = \"/home/students/spark-2.2.0\"\n\nfindspark.init()\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Get Some Context\n\n#%%\n\n# Create a SparkContext and a SQLContext context to use\nsc = SparkContext(appName=\"Logistic Regression with Spark\")\nsqlContext = SQLContext(sc)\n\n\n# ## Load and Prepare the Data\n\n#%%\n\nDATA_FILE = \"/home/students/data/mllib/sample_libsvm_data.txt\"\n\n#%%\n\ndata = sqlContext.read.format(\"libsvm\").load(DATA_FILE)\n\n#%%\n\n# View one of the records\ndata.take(1)\n\n#%%\n\n# Create train and test datasets\nsplits = data.randomSplit([0.8, 0.2], 42)\ntrain = splits[0]\ntest = splits[1]\n\n\n# ## Fit a Logistic Regression Model\n#\n# Arguments:\n# * maxIter: max number of iterations\n# * regParam: regularization parameter\n# * elasticNetParam: ElasticNet mixing param\n# * 1 = L1 Regularization (LASSO)\n# * 0 = L2 Regularization (Ridge)\n# * Between 0 and 1 = ElasticNet (L1 + L2)\n\n#%%\n\nlr = LogisticRegression(maxIter=10,\n regParam=0.3,\n elasticNetParam=0.8)\n\n#%%\n\nlr_model = lr.fit(train)\n\n#%%\n\n# Show the intercept\nprint(\"Intercept: \" + str(lr_model.intercept))\n\n\n# ## Create Predictions\n\n#%%\n\n# Create the predictions\npredictions = lr_model.transform(test)\npredictions.show(5)\n\n#%%\n\n# Plot the actuals versus predictions\nactuals = predictions.select('label').collect()\npredictions = predictions.select('prediction').collect()\n\nfig = plt.figure(figsize=(10, 5))\nplt.scatter(actuals, predictions)\nplt.xlabel(\"Actuals\")\nplt.ylabel(\"Predictions\")\nplt.title(\"Actuals vs. 
Predictions\")\nplt.show()\n\n\n# ## Model Evaluation\n\n#%%\n\n# Create the summary\nmetrics = lr_model.summary\n\n\n# ### Area Under ROC\n#\n# A measure of how well a parameter can distinguish between the two groups in a binary classification.\n#\n# * .90-1 = excellent (A)\n# * .80-.90 = good (B)\n# * .70-.80 = fair (C)\n# * .60-.70 = poor (D)\n# * .50-.60 = fail (F)\n\n#%%", "original_comment": "# Area under the ROC\n", "target_code": "print(\"Area Under ROC = %.2f\" % metrics.areaUnderROC)\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "metrics = metrics.areaUnderROC\nprint(\"Area under the ROC curve: \" + str(metrics))\n", "model": "docstring", "intent": "# Area under the ROC"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n\ncelldom.get_repo_dir()\n\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n#%%\n\nfrom celldom.config import experiment_config\nimport 
celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n#%%\n\ncelldom.get_repo_dir()\n\n#%%\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n#%%\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n#%%\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n#%%\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n#%%\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n#%%", "original_comment": "# Test that the path can be parsed successfully\n", "target_code": "exp_config.parse_path(test_path)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "if not osp.exists(test_path):\n get_ipython().system('mkdir $test_path')\n", "model": "docstring", "intent": "# Test that the path can be parsed successfully"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n\ncombine[0]\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n#%%\n\nnp.random.seed(8)\ns1 = 
pd.Series(np.random.randn(5))\ns1\n\n#%%\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n#%%\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n#%%\n\ncombine[0]", "original_comment": "# instead we can reindex:\n", "target_code": "combine.index = range(combine.count())\n", "project_metadata": {"full_name": "ContextLab/CDL-tutorials", "description": "Repo containing useful tutorials on different topics, methods, software tools, and packages used by the CDL", "topics": ["tutorial", "training-materials", "python", "bayesian-methods", "package-creation", "scientific-computing"], "git_url": "git://github.com/ContextLab/CDL-tutorials.git", "stars": 12, "watchers": 12, "forks": 2, "created": "2017-12-15T13:36:50Z", "size": 59045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 694197, "Python": 17099, "TeX": 9149, "Makefile": 5644, "Batchfile": 5096, "Dockerfile": 3050, "Shell": 128}, "last_updated": "2020-07-13T19:39:57Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "combine.reindex(s1.index)\n", "model": "docstring", "intent": "# we can reindex:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. 
Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # COVID-19 DETECTION FROM X-RAYS\n\n# This notebook builds and train pretrained resnet50 model for covid-19 detection from x-ray images. The dataset is curated by Dr. Joseph Cohen, a postdoctoral fellow at the University of Montreal. We collected the data from the following github repo: https://github.com/ieee8023/covid-chestxray-dataset. The dataset contains chest xrays from covid-19 patients and normal individuals.\n\n# **Import statements**\n\n#%%\n\nimport itertools\nimport os\nimport random\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nfrom sklearn.manifold import TSNE\nfrom sklearn.metrics import confusion_matrix\nimport tensorflow as tf\nimport cv2\nimport matplotlib\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display, HTML\ndisplay(HTML(\"\"\"\n\n\"\"\"))\n\n\n# **Set random seed of numpy & tensorflow**\n\n#%%\n\nnumpy_seed = 0\nnp.random.seed(numpy_seed)\ntensorflow_seed = 0\ntf.random.set_seed(tensorflow_seed)\n\n\n# **Read the filenames of the of positive and negative examples**\n\n#%%\n\ninput_dir = \"../dataset/\"\npositive_file_dirs = [input_dir+\"covid/\"+filename for filename in os.listdir(\n input_dir+\"covid/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\nnegative_file_dirs = [input_dir+\"normal/\"+filename for filename in os.listdir(\n input_dir+\"normal/\") if (\"jpeg\" in filename or \"jpg\" in filename)]\n\n\n# ## 1. Exploratory data analysis\n\n# ### 1.1 Bar chart of propotions\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (15.0, 10.0)\nobjects = ['positive', 'normal']\ny_pos = np.arange(len(objects))\ncases = [len(positive_file_dirs), len(negative_file_dirs)]\nplt.bar(y_pos, cases, align='center', alpha=0.5)\nplt.xticks(y_pos, objects, fontsize=20)\nplt.ylabel('#cases', fontsize=20)\nplt.title('Barchart of +ves & -ves', fontsize=40)\nplt.show()\n\n#%%\n\nIMG_HEIGHT = 512\nIMG_WIDTH = 512\nSIZE = len(positive_file_dirs) + len(negative_file_dirs)\nCHANNELS = 3\n\n\n# ### 1.2 Visualize positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(positive_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()\n\n\n# ### 1.3 Distribution of rgb channels of positive examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\nsns.set_style(\"darkgrid\")\nfor n, img_dir in enumerate(positive_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n if (n+1) % 5 == 0:\n plt.show()\n\n\n# ### 1.4 Visualize negative examples\n\n#%%\n\nmatplotlib.rcParams['figure.figsize'] = (25.0, 20.0)\nfor n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n img_resized = cv2.resize(img, (IMG_HEIGHT, IMG_WIDTH),\n interpolation=cv2.INTER_CUBIC)\n plt.imshow(img_resized)\n plt.title(\"shape:\"+str(img.shape))\n if (n+1) % 5 == 0:\n plt.show()\n\n#%%\n\n# matplotlib.rcParams['figure.figsize'] = (25.0, 5.0)\n# for n, img_dir in enumerate(negative_file_dirs):\n# plt.subplot(1,5,5-((n+1)%5))\n# img = cv2.imread(img_dir)\n# dims = img.shape\n# pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n# 
plt.hist2d(pixel_matrix[:,1], pixel_matrix[:,2], bins=(50,50))\n# if (n+1)%5==0:\n# plt.show()", "original_comment": "# ### 1.5 Distribution of rgb channels of negative examples\n", "target_code": "for n, img_dir in enumerate(negative_file_dirs):\n plt.subplot(1, 5, 5-((n+1) % 5))\n img = cv2.imread(img_dir)\n dims = img.shape\n pixel_matrix = np.reshape(img, (dims[0] * dims[1], dims[2]))\n sns.distplot(pixel_matrix[:, 1], bins=50, hist=False,\n color='g', kde_kws=dict(linewidth=10))\n sns.distplot(pixel_matrix[:, 2], bins=50, hist=False,\n color='b', kde_kws=dict(linewidth=5))\n sns.distplot(pixel_matrix[:, 0], bins=50, hist=False,\n color='r', kde_kws=dict(linewidth=3))\n", "project_metadata": {"full_name": "itratrahman/covid_19", "description": "This project contains AI and Data Science projects that analyses disease classification from images, forecasting, and EDA report of the pandemic.", "topics": [], "git_url": "git://github.com/itratrahman/covid_19.git", "stars": 5, "watchers": 5, "forks": 0, "created": "2020-03-22T03:36:28Z", "size": 26502, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6190010}, "last_updated": "2020-04-28T07:40:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=42)\n", "model": "no-comments", "intent": "# 1.5 Distribution of rgb channels of negative examples"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n\ndata.head()\n# look at the dataset\n\n\ndata.info()\n# basic info of dataset\n\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n\ndata.describe()\n\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n\n# We can observe that 
for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n\ndata.columns\n\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n\n# Data Visulaization\n\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n\nsns.countplot(data['room_type'])\n\n\n# We can observe reduced preference in shared rooms\n\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n\n# heavily skewed\n\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n\ndata.corr()\n\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n\n# to plot locaation and price on NY city map\n\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n\n# long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)\nplt.figure(figsize=(12, 12), facecolor='black')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn import linear_model\nfrom sklearn import 
svm\nimport time\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.linear_model import Lasso\nfrom sklearn import metrics\nfrom sklearn.metrics import r2_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\ndata = pd.read_csv(\"AB_NYC_2019.csv\", index_col=0)\n\n#%%\n\ndata.head()\n# look at the dataset\n\n#%%\n\ndata.info()\n# basic info of dataset\n\n#%%\n\n# to check for null values\ndata.isnull().sum()\n# returns column wise count of null values\n\n#%%\n\n# Dropping property without a valid host and a valid name\n# data.dropna('name',axis=1,inplace=True)\n\n#%%\n\ndata.describe()\n\n#%%\n\nfor col in data.columns:\n print(\"{}:{}\".format(col, data[col].nunique()))\n\n#%%\n\nvalues = {'last_review': 0, 'reviews_per_month': 0}\ndata = data.fillna(value=values)\n\ndata.dropna(axis=0, how='any')\n\n#%%\n\n# We can observe that for columns price,minimum_nights,number_reviews and\n# listing count the mean is very small compare to max implying outliers\n\n#%%\n\ndata.columns\n\n#%%\n\n# to divide the numerical and categorical columns\ncat_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n\n#%%\n\nnum_col = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\n\n#%%\n\n# Data Visulaization\n\n#%%\n\nsns.countplot(data[\"neighbourhood_group\"])\n\n#%%\n\nsns.countplot(data['room_type'])\n\n#%%\n\n# We can observe reduced preference in shared rooms\n\n#%%\n\nplt.figure(figsize=(100, 100))\na = sns.countplot(data[\"neighbourhood\"], palette=\"colorblind\")\n\n#%%\n\n# heavily skewed\n\n#%%\n\nfor col in num_col:\n count3 = dict(data[col].value_counts())\n names = count3.keys()\n values = count3.values()\n fig, axs = plt.subplots(1, 1, figsize=(8, 8), sharey=True)\n #axs[0].bar(names, values)\n axs.scatter(names, values)\n #axs[2].box(names, values)\n fig.suptitle(col)\n\n#%%\n\nfor col in num_col:\n sns.boxplot(data[col])\n plt.show()\n\n#%%\n\n# The box plot implies that here are some big outliers\n# But on further observation we can conclude that these are misleading\n\n#%%\n\nfor col in num_col:\n\n sns.distplot(data[col])\n plt.show()\n\n#%%\n\n# from the isnull() we observered that there are null values in reviews per month\n# Since the distrubution is symmetric for atleast third quartile we can fill null values\n# with mean\n\n#%%\n\ndata.corr()\n\n#%%\n\nplt.figure(figsize=(10, 10))\nsns.heatmap(data.corr())\n\n#%%\n\nnum_cols = ['price', 'minimum_nights', 'number_of_reviews',\n 'reviews_per_month', 'calculated_host_listings_count',\n 'availability_365']\nfor i in num_cols:\n for j in num_cols:\n if(i != j):\n plt.figure(figsize=(10, 10))\n sns.pairplot(data, x_vars=i, y_vars=j)\n\n#%%\n\n# preliminary observations from pair plot\n# number of reviews are more for less expensive places\n# reviews per month and reviews have a strong correlation\n# minimum number of nights sees more reviews per month\n\n#%%\n\n# to plot locaation and price on NY city map\n\n#%%\n\nlocation = pd.read_csv('AB_NYC_2019.csv', usecols=[\n 'id', 'latitude', 'longitude', 'price'])\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# lat_mean=location['latitude'].mean()\n# print(lat_mean)\n\n#%%\n\n# 
long_mean=location['longitude'].mean()\n# NY city map inspired from u/skend\n\n#%%\n\nlat_mean = -73.925111\nlong_mean = 40.744396\nlat = location['latitude']\nlon = location['longitude']\nprice = location['price']\n\n#%%\n\ninterp_spread = 0.0002\nnlat = []\nnlon = []\nfor idx, item in enumerate(price):\n nlat.extend(np.random.uniform(\n low=lat[idx] - interp_spread, high=lat[idx] + interp_spread, size=(item,)).tolist())\n nlon.extend(np.random.uniform(\n low=lon[idx] - interp_spread, high=lon[idx] + interp_spread, size=(item,)).tolist())\n\n#%%\n\nnrbins = 3500\nspread = 0.1\nhist = np.zeros((nrbins, nrbins))\n\n# Compute the histogram with the longitude and latitude data as a source\nhist, x_ranges, y_ranges = np.histogram2d(x=nlat, y=nlon, bins=nrbins,\n range=[[long_mean - spread, long_mean + spread], [lat_mean - spread, lat_mean + spread]])\n\n# We consider the counts on a logarithmic scale\nimg = np.log(hist[::-1, :] + 1)", "original_comment": "# Plot the counts\n", "target_code": "ax = plt.subplot(1, 1, 1)\nplt.imshow(img, 'hot')\nplt.axis('off')\nplt.tight_layout()\n", "project_metadata": {"full_name": "maheshd20/Da_project_sem5", "description": null, "topics": [], "git_url": "git://github.com/maheshd20/Da_project_sem5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-30T13:22:44Z", "size": 5278, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1470956}, "last_updated": "2020-11-30T15:37:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "precision": "Disagree", "precision-score": 1, "coverage": "Agree", "coverage-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure(figsize=(12, 12), facecolor='black')\nplt.imshow(img)\nplt.colorbar()\nplt.show()\n", "model": "no-comments", "intent": "# Plot the counts"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 
'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n\ndf['source'].value_counts()\n\n\ndf.groupby(['source', 'partition']).size()\n\n\ndf['digit'].value_counts()\n\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n\ntarget_shape = (32, 32)\n\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n 
mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n\nMODEL_DIR\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n\nwandb.init()\n\n\n# ### Run Training\n\n\nMODEL_DIR\n\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)\n", "original_context": "#!/usr/bin/env python\n# coding: 
utf-8\n\n#%%\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n#%%\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n#%%\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n#%%\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n#%%\n\ndf['source'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'partition']).size()\n\n#%%\n\ndf['digit'].value_counts()\n\n#%%\n\ndf.groupby(['source', 
'digit']).size().unstack()\n\n#%%\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n#%%\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n#%%\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n#%%\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n#%%\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n#%%\n\ntarget_shape = (32, 32)\n\n#%%\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n#%%\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) 
and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n#%%\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n#%%\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n#%%\n\nwandb.init()\n\n\n# ### Run Training\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n#%%\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n#%%\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)", "original_comment": "# Initialize from pre-trained model\n", "target_code": "model.load_weights(MODEL_PATH_HEAD)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": 
{"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model.load_weights('/lab/repos/svhn/weights.hdf5')\n", "model": "docstring", "intent": "# Initialize from pre-trained model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')\n\n\n\ngender_amt = pd.DataFrame(fraud_df.head(100), columns=['amt', 'gender'])\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 1 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n#%%\n\nfilename = \"../fraudTrain.csv\"\n\nfraud_df = pd.read_csv(filename)\n\n# fraud_df\n\n#%%\n\nprint(fraud_df.shape)\n\n\n# ### Boxplot of Amount\n\n#%%\n\nplt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=fraud_df.head(100), width=0.4, color='mediumpurple')", "original_comment": "# ### Boxplots of Amount by Gender\n", "target_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', x='gender', data=gender_amt, hue='gender',\n dodge=False, width=0.6, palette='Set2')\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure(figsize=(8, 10))\nsns.boxplot(y='amt', data=gender_amt, width=0.4, color='mediumpurple')\n", "model": "natural", "intent": "# Boxplots of Amount by Gender"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique 
referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. Let's import the libraries by the execution of the statements below:\n\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n\nrandom_seed = 42\n\n\n# ## 1. 
Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. 
Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b \\geq + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b \\leq - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply 2/||w||. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. 
Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin seperating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin seperating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classfied training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish result in the conditions, that $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the above shown Lagrangian can be reformulated to derive its **\"dual\"** formulation:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} + \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. 
Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 06 - \"Supervised Machine Learning Support Vector Classification\"\n#\n# Chartered Financial Data Scientist (CFDS), Spring Term 2020\n\n# In this lab, we will use a classification technique referred to as **Support Vector Machine (SVM)**. Please recall that SVMs correspond to the class of **discriminative** classifiers as distinguished in the following illustration:\n\n# \n#\n# (Inspired by: 'Machine Learning - A Probabilistic Perspective', Kevin P. Murphy)\n\n# The *discriminative* **Support Vector Machine (SVM)** classifier is a supervised machine learning model that learns an optimal separating $n$-dimensional hyperplane to distinguish different observations of training data according to their corresponding class labels. Until recently (before to the advent of deep learning approaches) SVMs have been used in a variety of applications such as isolated handwritten digit recognition[2], object recognition[3], speaker identification[4], face detection in images[5], and text categorisation[6].\n\n# This third lab builds in parts on the excellent SVM tutorial **\"A Tutorial on Support Vector Machines for Pattern Recognition\"** developed by Christopher J.C. Burges. The original tutorial is available under the following URL: https://link.springer.com/article/10.1023/A:1009715923555.\n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab, you should be able to:\n#\n# > 1. Understand how a **Suppport Vector Machine (SVM)** classifier can be trained and evaluated.\n# > 2. Understand the impact of selected **SVM hyperparameters** and distinct kernel functions.\n# > 3. Design and extract information of **handcrafted features** from a set of arbitrary images.\n# > 3. Train and evaluate discriminative **machine learning models** using Python's `scikit-learn` library.\n# > 4. Understand how to **evaluate** and **interpret** the classification results.\n\n# Before we start, let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# OpenAI: \"Solving Rubik's Cube with a Robot Hand\"\n# YouTubeVideo('x4O8pojMF0w', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# Similar to the previous labs, we need to import a couple of Python libraries that allow for data analysis and data visualisation. In this lab will use the `Pandas`, `Numpy`, `Scikit-Learn`, `Matplotlib` and the `Seaborn` library. 
Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import the numpy, scipy and pandas data science library\nimport pandas as pd\nimport numpy as np\nimport scipy as sp\nfrom scipy.stats import norm\n\n# import sklearn data and data pre-processing libraries\nfrom sklearn import datasets\nfrom sklearn.preprocessing import MinMaxScaler\nfrom sklearn.model_selection import train_test_split\n\n# import torchvision library\nimport torchvision\n\n# import sklearn HOG feature library\nfrom skimage.feature import hog\n\n# import sklearn support vector classifier (svc) library\nfrom sklearn.svm import SVC\n\n# import sklearn classification evaluation library\nfrom sklearn import metrics\nfrom sklearn.metrics import classification_report, confusion_matrix\n\n# import matplotlib data visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Ignore potential library warnings:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Use the 'Seaborn' plotting style in all subsequent visualisations:\n\n#%%\n\nplt.style.use('seaborn')\n\n\n# Set random seed of all our experiments:\n\n#%%\n\nrandom_seed = 42\n\n\n# ## 1. Support Vector Machine (SVM) Classification\n\n# ### 1.1. Dataset Download and Data Assessment\n\n# The **Iris Dataset** is a classic and straightforward dataset often used as a \"Hello World\" example in multi-class classification. This data set consists of measurements taken from three different types of iris flowers (referred to as **Classes**), namely the Iris Setosa, the Iris Versicolour, and, the Iris Virginica) and their respective measured petal and sepal length (referred to as **Features**).\n\n# \n#\n# (Source: http://www.lac.inpe.br/~rafael.santos/Docs/R/CAP394/WholeStory-Iris.html)\n\n# In total, the dataset consists of **150 samples** (50 samples taken per class) as well as their corresponding **4 different measurements** taken for each sample. Please, find below the list of the individual measurements:\n#\n# >- `Sepal length (cm)`\n# >- `Sepal width (cm)`\n# >- `Petal length (cm)`\n# >- `Petal width (cm)`\n#\n# Further details of the dataset can be obtained from the following publication: *Fisher, R.A. \"The use of multiple measurements in taxonomic problems\" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to Mathematical Statistics\" (John Wiley, NY, 1950).\"*\n#\n# Let's load the dataset and conduct a preliminary data assessment:\n\n#%%\n\niris = datasets.load_iris()\n\n\n# Print and inspect the names of the four features contained in the dataset:\n\n#%%\n\niris.feature_names\n\n\n# Determine and print the feature dimensionality of the dataset:\n\n#%%\n\niris.data.shape\n\n\n# Determine and print the class label dimensionality of the dataset:\n\n#%%\n\niris.target.shape\n\n\n# Print and inspect the names of the three classes contained in the dataset:\n\n#%%\n\niris.target_names\n\n\n# Let's briefly envision how the feature information of the dataset is collected and presented in the data:\n\n# \n\n# Let's inspect the top five feature rows of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.data, columns=iris.feature_names).head(10)\n\n\n# Let's also inspect the top five class labels of the Iris Dataset:\n\n#%%\n\npd.DataFrame(iris.target, columns=[\"class\"]).head(10)\n\n\n# Let's now conduct a more in-depth data assessment. 
Therefore, we plot the feature distributions of the Iris dataset according to their respective class memberships as well as the features pairwise relationships.\n\n# Pls. note that we use Python's **Seaborn** library to create such a plot referred to as **Pairplot**. The Seaborn library is a powerful data visualisation library based on the Matplotlib. It provides a great interface for drawing informative statistical graphics (https://seaborn.pydata.org).\n\n#%%\n\n# init the plot\nplt.figure(figsize=(10, 10))\n\n# load the dataset also available in seaborn\niris_plot = sns.load_dataset(\"iris\")\n\n# plot a pairplot of the distinct feature distributions\nsns.pairplot(iris_plot, diag_kind='hist', hue='species')\n\n\n# It can be observed from the created Pairplot, that most of the feature measurements that correspond to flower class \"setosa\" exhibit a nice **linear separability** from the feature measurements of the remaining flower classes. Besides, the flower classes \"versicolor\" and \"virginica\" exhibit a commingled and **non-linear separability** across all the measured feature distributions of the Iris Dataset.\n\n# ### 1.2. Dataset Pre-Processing and Train-/Test-Split\n\n# To understand and evaluate the performance of any trained **supervised machine learning** model, it is good practice, to divide the dataset into a **training set** (the fraction of data records solely used for training purposes) and an **evaluation set** (the fraction of data records solely used for evaluation purposes). Pls. note, the **evaluation set** will never be shown to the model as part of the training process.\n\n# \n\n# We set the fraction of evaluation records to **30%** of the original dataset:\n\n#%%\n\neval_fraction = 0.3\n\n\n# Randomly split the dataset into a training set and an evaluation set using sklearns `train_test_split` function:\n\n#%%\n\n# 70% training and 30% evaluation\nx_train, x_eval, y_train, y_eval = train_test_split(\n iris.data, iris.target, test_size=eval_fraction, random_state=random_seed, stratify=None)\n\n\n# Evaluate the dimensionality of the training dataset $x^{train}$:\n\n#%%\n\nx_train.shape, y_train.shape\n\n\n# Evaluate the dimensionality of the evaluation dataset $x^{eval}$:\n\n#%%\n\nx_eval.shape, y_eval.shape\n\n\n# ### 1.3. Support Vector Machine (SVM) Classification\n\n# Let's suppose we are given $l$ observations. Each observation consists of a pair: a vector $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ and the associated \"truth\" $y_{i}$, provided by a trusted source. In the context of a face detection task, $x_{i}$ might be vector of pixel values (e.g. $n$=256 for 1024x1024 pixel image), and $y_{i}$ would be $1$ if the image contains a face, and $-1$ otherwise.\n\n# #### 1.3.2. Linear Support Vector Machine (SVM) Classifiers - The Linear Separable Case\n\n# Suppose we have some hyperplane which separates the positive from the negative examples referred to as \"separating hyperplane\". The points $x$ which lie on the hyperplane satisfy the following equation $w \\cdot x + b = 0$, where $w$ is normal to the hyperplane, $|b|/||w||$ is the perpendicular distance from the hyperplane to the origin, and $||w||$ is the Euclidean norm of $w$. Let $d_{+}$ ($d_{-}$) be the shortest distance from the separating hyperplane to the closest positive (negative) example. We define the \"margin\" of a separating hyperplane to be $d_{+} + d_{-}$. 
In the context of the linearly separable case, the support vector algorithm simply looks for the separating hyperplane with the maximum margin.\n\n# \n#\n# Linear separating hyperplanes $H_{1}$, $H_{2}$, and $H^{*}$ for the separable case. The support vectors that constitute $H_{1}$, $H_{2}$ are circled.\n#\n# (Source: https://link.springer.com/article/10.1023/A:1009715923555)\n\n# Suppose that all the training data satisfies the following constraints:\n\n# $$ x_{i} \\cdot w + b \\geq + 1, y_{i} = +1 $$\n#\n# $$ x_{i} \\cdot w + b \\leq - 1, y_{i} = -1 $$\n\n# This can be combined into one set of inequalities:\n\n# $$y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0, \\forall_{i}$$\n\n# Let's now consider the points for which the equality $x_{i} \\cdot w + b = + 1$ holds. These points lie on a hyperplane $H_{1}: x_{i} \\cdot w + b = + 1$ with normal $w$ and perpendicular distance from the origin $|1-b|/||w||$. Similarly, the points for which the equality $x_{i} \\cdot w + b = - 1$ holds lie on the hyperplane $H_{2}: x_{i} \\cdot w + b = -1$, with normal again $w$, and perpendicular distance from the origin $|-1-b|/||w||$. Hence $d_{+} = d_{-} = 1 / ||w||$ and the margin is simply $2/||w||$. Note that $H_{1}$ and $H_{2}$ are parallel and that no training points $x_{i}$ fall between them. Thus we can find a pair of hyperplanes which correspond to a maximum margin by minimizing $||w||^{2}$, subject to the constraint $y_{i}(x_{i} \\cdot w + b) - 1 \\geq 0$. Those training points $x_{i}$ which wind up lying on one of the hyperplanes $H_{1}$, $H_{2}$, and whose removal would change the solution found, are referred to as **\"support vectors\"**.\n\n# #### A \"Primal\" Optimization Objective Formulation\n\n# As discussed in the lecture, we can reformulate the objective of finding such a max-margin separating hyperplane as a Lagrangian optimization objective. Thereby, we introduce a set of positive Lagrange multipliers $\\alpha_{i}, i=1, ..., l$ which turns the search for a max-margin separating hyperplane into solving the following Lagrangian:\n\n# $$L_{P} = \\frac{1}{2}||w||^{2} - \\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b) + \\sum_{i=1}^{l}\\alpha_{i}$$\n\n# We must now minimize $L_{P}$, referred to as the **\"primal\"**, with respect to $w$, $b$. Thereby,\n#\n# > 1. the minimization of the first term $\\frac{1}{2}||w||^{2}$ maximizes the margin of the separating hyperplane,\n# > 2. the maximization of the second term $\\sum_{i=1}^{l} \\alpha_{i}y_{i}(x_{i} \\cdot w + b)$ maximizes the number of correctly classified training samples,\n# > 3. the minimization of the third term $\\sum_{i=1}^{l}\\alpha_{i}$ minimizes the number of support vectors.\n\n# Minimization of $L_{P}$ is a convex quadratic programming problem, since the objective function is itself convex, and those points for which $\\alpha_{i} > 0$ that satisfy the constraints also form a convex set. Again, those points are called \"support vectors\", and lie on one of the hyperplanes $H_{1}$, $H_{2}$.\n\n# #### A \"Dual\" Optimization Objective Formulation\n\n# Requiring that the gradient of $L_{P}$ with respect to $w$ and $b$ vanish results in the conditions $w = \\sum_{i=1}^{l} \\alpha_{i}y_{i}x_{i}$ and $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$. Using those conditions, the Lagrangian shown above can be reformulated to derive its **\"dual\"** formulation, which is maximized with respect to the $\\alpha_{i}$:\n\n# $$L_{D} = \\sum_{i=1}^{l}\\alpha_{i} - \\frac{1}{2} \\sum_{i,j=1}^{l} \\alpha_{i}\\alpha_{j}y_{i}y_{j} x_{i} \\cdot x_{j}$$\n\n# Note that solving the dual formulation doesn't depend on $w$ anymore. 
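To see why, note that substituting $w = \\sum_{j=1}^{l} \\alpha_{j}y_{j}x_{j}$ back into $L_{P}$ and using $\\sum_{i=1}^{l}\\alpha_{i}y_{i} = 0$ gives $\\frac{1}{2}||w||^{2} = \\frac{1}{2}\\sum_{i,j=1}^{l}\\alpha_{i}\\alpha_{j}y_{i}y_{j} x_{i} \\cdot x_{j}$ as well as $\\sum_{i=1}^{l}\\alpha_{i}y_{i}(x_{i} \\cdot w + b) = \\sum_{i,j=1}^{l}\\alpha_{i}\\alpha_{j}y_{i}y_{j} x_{i} \\cdot x_{j}$, so the two quadratic terms collapse into the single $-\\frac{1}{2}$ term of $L_{D}$. 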
It only depends on the samples $x_{i} \\in \\mathbb{R}^{n}, i=1, ..., l$ of the training dataset as well as the associated labels $y_{i}$. This indicates that the optimal seperating hyperplane $H^{*}$ becomes a linear function of the data. Note also that if we formulate the problem, as above, with $b=0$, requires that all hyperplanes contain the origin. However, this is a mild restriction for high dimensional spaces since it amounts to reducing the number of degrees of freedom by one.\n\n# #### 1.3.3. Training of a Linear Support Vector Machine (SVM) Classifer using Python's Scikit-Learn Library\n\n# Luckily, the `Scikit-Learn` (https://scikit-learn.org) machine learning library provides a variety of machine learning algorithms that can be easily interfaced using the Python programming language. Among others the library also contains a variety of supervised classification algorithms such as the **Support Vector Machine (SVM)** classifier. The SVM classifier can be trained \"off-the-shelf\" to solve the dual Lagrangian $L_{D}$ optimization objective formulated above. Let's instantiate one of the SVM classifiers available in `Scikit-Learn` to learn a linear seperating hyperplane:\n\n#%%", "original_comment": "# init the Support Vector Machine classifier\n", "target_code": "from sklearn.svm import SVC\n\nsvm = SVC(kernel='linear', random_state=random_seed)\n", "project_metadata": {"full_name": "financial-data-science/CFDS", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-science", "financial-data-analysis", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS.git", "stars": 16, "watchers": 16, "forks": 10, "created": "2019-10-11T18:13:38Z", "size": 46128, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2359002}, "last_updated": "2021-01-08T06:48:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "scaler = MinMaxScaler()\nx_train = scaler.fit_transform(x_train)\nx_eval = scaler.transform(x_eval)\n", "model": "no-comments", "intent": "# init the Support Vector Machine classifier"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n\n# shape\ndf.shape\n\n\ndf\n\n\n# top 5 rows in 
dataframe\ndf.head()\n\n\ndf.info()\n\n\ndf.describe()\n\n\n# statistical details T is transpost\ndf.describe().T\n\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n\nX\n\n\ny\n\n\n# ## split this data into training and test sets\n\n\n\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n\n\n\n\nmodel = LinearRegression()\n\n\nmodel\n\n\n# ## Train model\n\n\nmodel.fit()\n\n\nmodel.fit(X_train, y_train)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Linear Regression\n\n#%%\n\n# Basic Library\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ----\n\n# ## Load Dataset\n\n#%%\n\ndf = pd.read_csv(\"dataset/student_scores.csv\")\n\n#%%\n\n# shape\ndf.shape\n\n#%%\n\ndf\n\n#%%\n\n# top 5 rows in dataframe\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.describe()\n\n#%%\n\n# statistical details T is transpost\ndf.describe().T\n\n#%%\n\n# plot 2-D graph find any relationship between the data\ndf.plot(x='Hours', y='Scores', style='o', figsize=(9, 9))\nplt.title('Hours Vs. Scores', fontsize=20)\nplt.xlabel('Hours', fontsize=20)\nplt.ylabel('Scores', fontsize=20)\nplt.show()\n\n#%%\n\n# Preparing the Data\nX = df[\"Hours\"].values\ny = df[\"Scores\"].values\n\n#%%\n\nX\n\n#%%\n\ny\n\n\n# ## split this data into training and test sets\n\n#%%\n\n\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=0)\n\n\n# ## Modelling\n\n#%%\n\n\n\n#%%\n\nmodel = LinearRegression()\n\n#%%\n\nmodel\n\n\n# ## Train model\n\n#%%\n\nmodel.fit()\n\n#%%\n\nmodel.fit(X_train, y_train)", "original_comment": "# ### Reshape Feature\n", "target_code": "X = X.reshape(-1, 1)\n", "project_metadata": {"full_name": "Jetsukda/ML-KBTGxMeowCode", "description": "Say \"Hello\" Machine Learning by KBTGxMeowCode", "topics": [], "git_url": "git://github.com/Jetsukda/ML-KBTGxMeowCode.git", "stars": 3, "watchers": 3, "forks": 34, "created": "2020-06-28T07:57:09Z", "size": 5316, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 11012140}, "last_updated": "2020-09-01T17:59:00Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))\nX_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))\nprint(X_train.shape)\nprint(X_test.shape)\n", 
"model": "natural", "intent": "# Reshape Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n\nleads_scoring.head(5)\n\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n\nround(\n 100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### 
Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. Hence replacing the null values with 'Specialization Not given'\n\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . 
Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n\nleads_scoring.drop('Do Not Call', axis=1, 
inplace=True)\n\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n\nround(leads_scoring.describe(), 2)\n\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n\nleads_scoring_model.info()\n\n\nleads_scoring_model.shape\n\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n\n\n\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n\n\n\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n\nX_train.describe()\n\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n\n\n\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n\nlogreg = LogisticRegression()\n\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n\n\n\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n\ncols = X_train.columns[rfe.support_]\n\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n\ncols\n\n\nX_train.shape\n\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data set\n\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a 
dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n\n\n\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n#%%\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n#%%\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n#%%\n\nleads_scoring.head(5)\n\n#%%\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n#%%\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n#%%\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n#%%\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n#%%\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n#%%\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n#%%\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n#%%\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n#%%\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n#%%\n\nround(\n 100*(leads_scoring['Lead 
Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n#%%\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n#%%\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n#%%\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n#%%\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n#%%\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n#%%\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n#%%\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n#%%\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n#%%\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n#%%\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n#%%\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n#%%\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n#%%\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n#%%\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n#%%\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n#%%\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n#%%\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n#%%\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n#%%\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 
100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n#%%\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n#%%\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n#%%\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n#%%\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n#%%\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n#%%\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n#%%\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n#%%\n\nleads_scoring_model.info()\n\n#%%\n\nleads_scoring_model.shape\n\n#%%\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n#%%\n\n\n\n#%%\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n#%%\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n#%%\n\n\n\n#%%\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n#%%\n\nX_train.describe()\n\n#%%\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n#%%\n\n\n\n#%%\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n#%%\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n#%%\n\nlogreg = LogisticRegression()\n\n#%%\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n#%%\n\n\n\n#%%\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n#%%\n\ncols = X_train.columns[rfe.support_]\n\n#%%\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n#%%\n\ncols\n\n#%%\n\nX_train.shape\n\n#%%\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n#%%\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data 
set\n\n#%%\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n#%%\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n#%%\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n#%%\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n#%%\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n#%%\n\n\n\n#%%\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n#%%", "original_comment": "# Let's check the overall accuracy.\n", "target_code": "print(metrics.accuracy_score(y_train_pred_final.Converted,\n y_train_pred_final.Predicted_Conversion))\n", "project_metadata": {"full_name": "saad1504/Upgrad_DataScience_Projects", "description": "All Data Science projects completed for PGPDS by Upgrad", "topics": [], "git_url": "git://github.com/saad1504/Upgrad_DataScience_Projects.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2019-10-14T16:57:22Z", "size": 29931, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6008971, "PLSQL": 11605}, "last_updated": "2020-10-12T22:18:23Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion))\n", "model": "natural", "intent": "# Let's check the overall accuracy."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n 
train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n\nic.fit(inputs=s3_channels)\n\n\n\nendpoint_name = 'c5-'+time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom IPython.display import Image\nimport time\nfrom sagemaker.amazon.amazon_estimator import get_image_uri\nimport boto3\nimport sagemaker\n\nsession = sagemaker.Session()\nbucket = session.default_bucket()\n\n#%%\n\nprefix = 'dogscats'\ns3_train_path = 's3://{}/{}/input/train/'.format(bucket, prefix)\ns3_val_path = 's3://{}/{}/input/validation/'.format(bucket, prefix)\ns3_output = 's3://{}/{}/output/'.format(bucket, prefix)\n\nprint(s3_train_path)\nprint(s3_val_path)\nprint(s3_output)\n\n\n# ### Get the name of the image classification algorithm in our region\n\n#%%\n\nregion_name = boto3.Session().region_name\ncontainer = get_image_uri(region_name, \"image-classification\", \"latest\")\nprint(container)\n\n\n# ### Configure the training job\n\n#%%\n\nrole = sagemaker.get_execution_role()\n\nic = sagemaker.estimator.Estimator(container,\n role,\n train_instance_count=1,\n train_instance_type='ml.p3.2xlarge',\n output_path=s3_output,\n sagemaker_session=session)\n\n\n# ### Set algorithm parameters\n\n#%%\n\n#precision_dtype = 'float16'\nprecision_dtype = 'float32'\n\nic.set_hyperparameters(num_layers=18, # Train a Resnet-18 model\n use_pretrained_model=0, # Train from scratch\n num_classes=2, # Dogs and cats\n num_training_samples=22500, # Number of training samples\n mini_batch_size=128,\n precision_dtype=precision_dtype,\n epochs=10) # Learn the training samples 10 times\n\n\n# ### Set dataset parameters\n\n#%%\n\ntrain_data = sagemaker.session.s3_input(s3_train_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\nvalidation_data = sagemaker.session.s3_input(s3_val_path,\n distribution='FullyReplicated',\n content_type='application/x-recordio',\n s3_data_type='S3Prefix')\n\ns3_channels = {'train': train_data, 'validation': validation_data}\n\n\n# ### Train the model\n\n#%%\n\nic.fit(inputs=s3_channels)", "original_comment": "# ### Deploy the model\n", "target_code": "c5_predictor = ic.deploy(initial_instance_count=1,\n instance_type='ml.c5.large',\n endpoint_name=endpoint_name,\n wait=False)\n", "project_metadata": {"full_name": "PacktPublishing/Learn-Amazon-SageMaker", "description": "Learn Amazon SageMaker", "topics": [], "git_url": "git://github.com/PacktPublishing/Learn-Amazon-SageMaker.git", "stars": 30, "watchers": 30, "forks": 20, "created": "2020-04-22T14:55:25Z", 
"size": 47447, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2818256, "Python": 146100, "R": 2078, "Dockerfile": 738}, "last_updated": "2020-12-29T08:53:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "ic_classifier = ic.deploy(initial_instance_count=1,\n instance_type='ml.m4.xlarge')\n", "model": "natural", "intent": "# Deploy the model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. 
control for t1, t2, and t3.\n#\n\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 2009)?\n# - Select and apply the appropriate 
statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. 
control for t1, t2, and t3.\n#\n\n#%%\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n#%%\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n#%%\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n#%%\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n#%%\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n#%%\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n#%%\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n#%%\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 
2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n#%%\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n#%%\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n#%%", "original_comment": "# Plot the histogram for the values of each year.\n", "target_code": "plt.figure()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure()\nsns.distplot(gdp['2007'])\nplt.figure()\nsns.distplot(gdp['2008'])\nplt.figure()\nsns.distplot(gdp['2009'])\n", "model": "no-comments", "intent": "# Plot the histogram for the values of each year."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n\nlow_income_norm_list = 
df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n#%%\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n#%%\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n#%%\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n#%%\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n#%%\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n#%%\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n#%%\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n#%%\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)", "original_comment": "# ### The following code cells shape the new arrays.\n", "target_code": "np.shape(d_norm)\n", "project_metadata": {"full_name": "abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation", "description": "initial commit", "topics": [], "git_url": "git://github.com/abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-11-06T17:59:31Z", "size": 13456, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5835338, "Python": 6376}, "last_updated": "2020-11-12T20:56:51Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig = plt.figure(figsize=(10, 10))\nax = fig.add_subplot(111)\nax.scatter(x, y)\nax.set_xlabel('HH_income_less_35k')\nax.set_ylabel('Total_Population')\nplt.show()\n", "model": "no-comments", "intent": "# shape array"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. 
Load Data\n\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. Pre-process\n\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n#%%\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n#%%\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. Pre-process\n\n#%%\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)", "original_comment": "# print out all data shapes\n", "target_code": "print('Training set', train_dataset.shape, train_labels.shape)\nprint('Validation set', validation_dataset.shape, validation_labels.shape)\nprint('Test set', test_dataset.shape, test_labels.shape)\n", "project_metadata": {"full_name": "cwru-robotics/cwru_dnn", "description": "deep neural net explorations", "topics": [], "git_url": "git://github.com/cwru-robotics/cwru_dnn.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-07-25T14:47:31Z", "size": 49625, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 329694, "Python": 19000, "C++": 17781, "CMake": 7310}, "last_updated": "2020-03-13T14:59:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "tf.debugging.info('train_dataset:', train_dataset.shape)\ntf.debugging.info('validation_dataset:', validation_dataset.shape)\ntf.debugging.info('test_dataset:', test_dataset.shape)\n", "model": "docstring", "intent": "# print out all data shapes"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n\nfrom sklearn.metrics import 
confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n 
print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\ndef frequencyDists(wine_set):\n print(\"This is the frequency distribution of the wines' quality.\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n#%%\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, 
sep=';')\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n#%%\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n#%%\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n#%%\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n#%%\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n#%%\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n 
plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n#%%\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\n#%%", "original_comment": "# print frequency distributions of wines' quality\n", "target_code": " print(wine_set.groupby(\"quality\").size()*100 / len(wine_set))\n", "project_metadata": {"full_name": "shrikant-temburwar/Wine-Quality-Dataset", "description": null, "topics": [], "git_url": "git://github.com/shrikant-temburwar/Wine-Quality-Dataset.git", "stars": 7, "watchers": 7, "forks": 13, "created": "2018-06-11T14:03:02Z", "size": 575, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 670078}, "last_updated": "2020-12-16T12:41:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "wine_set.quality.value_counts()\n", "model": "natural", "intent": " # print frequency distributions of wines' quality"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # 
print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n 
image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = 
'/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n\nimageio.plugins.freeimage.download()\n\n\ndef 
keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n\nlog_list = 
[]\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n 
\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n 
lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Creating LDR folders\n\n#%%\n\nimport sys\nimport itertools\nimport matplotlib.mlab as mlab\nimport matplotlib.pylab as plt\nimport collections\nimport matplotlib.pyplot as plt\nimport math\nimport random\nimport re\nimport os\nimport h5py\nimport cv2\nimport numpy as np\nimport imageio\nimport glob\nimport PIL\nfrom PIL import Image\nimport subprocess as sp\n\ncount = 0\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*/')\n\nfor paths in file_list:\n\n tmp_list = []\n ldr = glob.glob('%s/*.png' % (paths))\n\n for items in ldr:\n\n subfolder = items.split('/')[6]\n subfolder = subfolder.split('_')[0]\n\n if '_cc' in items:\n subfolder = subfolder+'_CC'\n\n if subfolder not in tmp_list:\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_ldr/%s' % (\n subfolder)\n sp.Popen(cmd, shell=True)\n\n tmp_list.append(subfolder)\n\n\n# # Create hdr folders\n\n#%%\n\nfile_list = glob.glob('/misc/lmbraid18/bharadwk/data/hdr/*')\n\nfor items in file_list:\n items = items.split('/')[6]\n items = items.split('.')[0]\n cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data_hdr/%s' % (items)\n sp.Popen(cmd, shell=True)\n\n\n# # LDR IMAGE SCALING\n\n#%%\n\ndir_list = []\ndir_list = glob.glob('/misc/lmbraid18/bharadwk/data/data*')\n\nfor paths in dir_list:\n\n subfolder = paths.split('/')[5]\n #cmd = 'mkdir /misc/lmbraid18/bharadwk/scaled_data/%s' %(subfolder)\n #sp.Popen(cmd, shell=True)\n # print(subfolder)\n #paths = paths+'*.png'\n img_list = glob.glob(\"%s/*.png\" % (paths))\n new_img_list = []\n tmp_list = []\n\n for i in range(0, len(img_list)):\n img_name = ''\n ximg_name = ''\n count = 0\n for j in range(0, len(img_list)):\n\n img_name = img_list[i].split('/')[6]\n ximg_name = img_list[j].split('/')[6]\n\n if 'cc' in img_name:\n name_list = []\n name_list = 
img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd + '_CC'\n\n else:\n name_list = []\n name_list = img_name.split('_')\n img_name, shtspd = name_list[0], name_list[3]\n img_name = img_name + '_' + shtspd\n\n if 'cc' in ximg_name:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd + '_CC'\n\n else:\n xname_list = []\n xname_list = ximg_name.split('_')\n ximg_name, shtspd = xname_list[0], xname_list[3]\n ximg_name = ximg_name + '_' + shtspd\n\n if img_name == ximg_name and ximg_name not in tmp_list:\n\n count = count + 1\n image_name = ximg_name + '_' + str(count)\n new_img_list.append(image_name)\n\n # Resizing width:\n new_width = 2080\n\n # Resizing height:\n new_height = 1408\n\n img = Image.open(img_list[j])\n #img = img.resize((new_width, new_height), Image.ANTIALIAS)\n img.save('/misc/lmbraid18/bharadwk/scaled_data_ldr/%s/%s.png' %\n (ximg_name, image_name))\n\n tmp_list.append(img_name)\n\n\n# # HDR IMAGE SCALING\n\n#%%\n\nimageio.plugins.freeimage.download()\n\nxmin_list = []\nxmax_list = []\nfiles = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*/*.exr')\nfor paths in files:\n #filename = paths.split('/')[6]\n #filename = filename.split('.')[0]\n img = cv2.imread(paths, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #arry = np.array(img, dtype=np.float32)\n xmax_list.append(img.max())\n xmin_list.append(img.min())\n #arry.resize(2080, 1408, 3)\n #imageio.imwrite('/misc/lmbraid18/bharadwk/scaled_data_hdr/%s/%s.hdr' %(filename, filename), img)\n\n#%%\n\nimg = imageio.imread(\n \"/misc/lmbraid18/bharadwk/scaled_data_hdr/S0160_CC/S0160_CC.hdr\", format='HDR-FI')\n#arry = np.array(img, dtype=np.float32)\n#arry.resize(2080, 1408, 3)\n#imageio.imwrite('/misc/lmbraid18/bharadwk/pfstools-2.0.6/build/src/pfsview/S0010_updated.hdr', arry)\nprint(img.max())\nprint(img.min())\n\n\n# # Normalize PNG (0,1)\n\n#%%\n\nimage = cv2.imread(\"/misc/lmbraid18/bharadwk/LDR_render_files/3DGRASSFIELD/image_1.png\",\n cv2.IMREAD_COLOR) # uint8 image\nnorm_image = np.zeros((800, 800))\nnorm_image = cv2.normalize(image, norm_image, alpha=0,\n beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)\nprint(norm_image)\n\n\n# # Create HDF5 for OpenExr files\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/colorVar')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/colorVar')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n PNGfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n EXRfiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.exr\" % (ldr_img))\n\n X_data1 = []\n arr_list = []\n name_list = []\n\n for pngFile in sorted(PNGfiles, key=keyFunc):\n image1 = cv2.imread(pngFile)\n X_data1.append(image1)\n for exrFile in sorted(EXRfiles, key=keyFunc):\n name = exrFile.split('/')[6]\n if name not in name_list:\n name_list.append(name)\n image2 = cv2.imread(\n exrFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image2)\n break\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = 
'/misc/lmbraid18/bharadwk/dataHDR/%s/hdr.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages_HDRLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create HDF5 for shuffled LDR images\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n #image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n\n random.shuffle(X_data1)\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 0, 2)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list, dtype=np.float32)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_shuffleLDR/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Split LDR in 10 different datasets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_split/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.png\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', 
data=hdr_array)\n\n h5file.close()\n\n\n# # Split LDR in 10 different datasets and shuffle and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set_split_shuffleLDR/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/LDR_render_files/%s/*.png\" % (ldr_img))\n random.shuffle(files)\n count = 0\n\n for myFile in files:\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(myFile)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Split HDR candidates in 10 different datsets and create HDF5\n\n#%%\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/test_LDR_render_files/*')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if ldr_img == hdr_img:\n\n h5file = h5py.File(\n '/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set_HDRCandidate/%s.h5' % (hdr_img), 'a')\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/test_LDR_render_files/%s/*.exr\" % (ldr_img))\n count = 0\n\n for myFile in sorted(files, key=keyFunc):\n\n count = count + 1\n X_data1 = []\n image = cv2.imread(\n myFile, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n X_data1.append(image)\n X_data1 = np.array(X_data1, dtype=np.float32)\n X_data1 = np.swapaxes(X_data1, 2, 3)\n X_data1 = np.swapaxes(X_data1, 1, 2)\n\n dataset = h5file.create_dataset(\n 'data%d' % (count), data=X_data1)\n\n if count == 10:\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/test_hdr_rendered_image/%s/hdr_image.exr' % (\n hdr_img)\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n dataset2 = h5file.create_dataset('hdr', data=hdr_array)\n\n h5file.close()\n\n\n# # Create HDF5 for .hdr files\n\n#%%\n\nimageio.plugins.freeimage.download()\n\n\ndef keyFunc(afilename):\n nondigits = re.compile(\"\\D\")\n return int(nondigits.sub(\"\", afilename))\n\n\nldr_files = glob.glob('/misc/lmbraid18/bharadwk/dataLDR/realkitchen')\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/dataHDR/realkitchen')\n\nfor items1 in ldr_files:\n for items2 in hdr_files:\n\n ldr_img = items1.split('/')[5]\n hdr_img = items2.split('/')[5]\n\n if 
ldr_img == hdr_img:\n\n files = glob.glob(\n \"/misc/lmbraid18/bharadwk/dataLDR/%s/*.JPG\" % (ldr_img))\n X_data1 = []\n arr_list = []\n\n for myFile in sorted(files, key=keyFunc):\n\n image = cv2.imread(myFile)\n X_data1.append(image)\n\n X_data2 = np.array(np.dstack(X_data1), dtype=np.float32)\n X_data2 = np.swapaxes(X_data2, 2, 3)\n X_data2 = np.swapaxes(X_data2, 1, 2)\n arr_list.append(X_data2)\n im_array = np.array(arr_list)\n\n hdr_list = []\n hdr_path = '/misc/lmbraid18/bharadwk/dataHDR/%s/%s.exr' % (\n hdr_img, hdr_img)\n #hdr_image = imageio.imread(hdr_path, format='HDR-FI')\n hdr_image = cv2.imread(\n hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_list.append(hdr_image)\n hdr_array = np.array(hdr_list, dtype=np.float32)\n hdr_array = np.swapaxes(hdr_array, 2, 3)\n hdr_array = np.swapaxes(hdr_array, 1, 2)\n\n with h5py.File('/misc/lmbraid18/bharadwk/workspace/ws1/projectimages/%s.h5' % (hdr_img)) as hdf:\n\n D1 = hdf.create_dataset('data', data=im_array)\n D2 = hdf.create_dataset('hdr', data=hdr_array)\n\n\n# # Create the TRAINFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_training_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Create the TESTFILE list\n\n#%%\n\nfo = open(\"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_list.txt\", \"w\")\nfile_list = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/new_test_set/*.h5\")\nl_o_list = len(file_list)\n# print l_o_list\n\nfor i in range(0, l_o_list):\n fo.write(file_list[i])\n fo.write('\\n')\nfo.close()\n\n\n# # Find the MIN & MAX value of the .EXR files\n\n#%%\n\nhdr_files = glob.glob('/misc/lmbraid18/bharadwk/test_hdr_rendered_image/*')\nfo = open('/misc/lmbraid18/bharadwk/workspace/test_min_max3.txt', 'a')\nfo.write('{a:^0}{b:^50}{c:^50}'.format(\n a='Image Name', b='Min Value', c='Max Value'))\nfo.write('\\n\\n')\n\nfor paths in hdr_files:\n xpaths = paths\n hdr_img = xpaths.split('/')[5]\n hdr_path = paths+'/hdr_image.exr'\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdr_arr = np.array(hdr_image)\n #print (\"Min value\", hdr_arr.min())\n #print (\"Max value\", hdr_arr.max())\n #print (\"\\n\")\n fo.write('{:>0}'.format(str(hdr_img)))\n fo.write('{:>50}'.format(str(hdr_arr.min())))\n fo.write('{:>50}'.format(str(hdr_arr.max())))\n fo.write('\\n')\nfo.close()\n\n\n# # Converting NAN values in .EXR to '0'\n\n#%%\n\nhdr_image = cv2.imread(\"/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr\",\n cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\nhdr_array = np.array(hdr_image, dtype=np.float32)\nhdr_array = np.nan_to_num(hdr_array)\ncv2.imwrite('/misc/lmbraid18/bharadwk/tmp1/Balcony5/hdr_image.exr', hdr_array)\n\n\n# # Normalize the MAX and MIN range\n\n#%%\n\nL = min_list + max_list\n\nmax_val = max(L)\nmin_val = min(L)\ndiff = max_val - min_val\nnorm_list = []\n\nfor items in L:\n items = np.float32(items)\n x = (items - min_val)/diff\n norm_list.append(x)\n\n\n# # Plot loss for training data\n\n#%%\n\nlog_list = []\nloss_list = []\nmean_loss_list = []\ncount1 = 5000\ncount2 = 0\nx_list = []\n\nlog_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr8-*.log'))\n\nfor items in log_list:\n mean = 0.0\n loss_list = []\n filename = items.split('/')[6]\n fo1 = open(items, 'r')\n filelist = fo1.readlines()\n for pos, xitems in 
enumerate(filelist):\n if 'Train net output' in xitems:\n count2 = count2 + 1\n loss = 0.0\n loss = filelist[pos].split('=')[1]\n loss = loss.split('(')[0]\n loss = float(loss)\n loss_list.append(loss)\n\n if count2 == count1:\n x_list.append(count1)\n mean = np.mean(loss_list)\n mean_loss_list.append(mean)\n count1 = count1 + 5000\n\nmean_loss_array = np.asarray(mean_loss_list)\nx_list_array = np.asarray(x_list)\nplt.plot(x_list, mean_loss_list)\nplt.ylabel('l1 norm loss')\nplt.xlabel('No of iterations')\nplt.show()\n\n\n# # Compute PSNR\n\n#%%\n\nrefList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/testHDRImages/*.exr\")\npredList = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\nerrEstList = []\nprint(len(refList), len(predList))\n# for images in imgList:\nfor predPath, refPath in zip(predList, refList):\n\n pred = cv2.imread(predPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #pred = pred[0:468,0:628]\n #pred = pred[0:446,0:606]\n ref = cv2.imread(refPath, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n #ref = ref[0:468,0:628]\n #ref = ref[0:446,0:606]\n maxVal = ref.max()\n sqrdErr = np.sum((pred.astype(\"float\") - ref.astype(\"float\"))\n ** 2)/float(pred.shape[0] * pred.shape[1])\n errEst = 20*(np.log10(maxVal/sqrdErr))\n errEstList.append(errEst)\n #print (errEst)\n\nprint(np.mean(errEstList))\n\n\n# # Converting HDF5 to EXR\n\n#%%\n\nh5Dir = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.h5\")\n\nfor h5s in h5Dir:\n fileName = h5s.split('/')[7]\n fileName = fileName.split('.')[0]\n file = h5py.File(h5s, 'r')\n dataset = file[\"prediction\"]\n #dataset = file[\"hdr\"]\n xdataset = np.array(dataset, dtype=np.float32)\n xdataset = np.swapaxes(xdataset, 1, 3)\n xdataset = np.swapaxes(xdataset, 1, 2)\n xdataset = np.squeeze(xdataset, axis=0)\n file.close()\n\n cv2.imwrite('/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.exr' %\n (fileName), xdataset)\n\n\n# # ToneMap the HDR to get PNG using OpenCV\n\n#%%\n\nEXRFiles = glob.glob(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/*.exr\")\n\nfor EXRs in EXRFiles:\n\n EXRName = EXRs.split('/')[7]\n EXRName = EXRName.split('.')[0]\n hdr_image = cv2.imread(EXRs, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n tonemap = cv2.createTonemap(gamma=1.3)\n #tonemap = cv2.createTonemapMantiuk(gamma=1.3)\n res = tonemap.process(hdr_image.copy())\n img = np.clip(res*255, 0, 255).astype('uint8')\n cv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/%s.jpg\" % (EXRName), img)\n\n\n# # Different ToneMapping\n\n#%%\n\n# Tonemap HDR image\nhdr_path = '/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/score_iter_00012.h5.exr'\nhdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\ntonemap1 = cv2.createTonemapDurand(gamma=1.3)\nres_debvec = tonemap1.process(hdr_image.copy())\ntonemap2 = cv2.createTonemapDurand(gamma=1.3)\nres_robertson = tonemap2.process(hdr_image.copy())\n\n# Convert datatype to 8-bit and save\nres_debvec_8bit = np.clip(res_debvec*255, 0, 255).astype('uint8')\nres_robertson_8bit = np.clip(res_robertson*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/durand.jpg\", res_debvec_8bit)\ncv2.imwrite(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter_deformation_l2perloc/ldr_robertson.jpg\", res_robertson_8bit)\n#cv2.imwrite(\"fusion_mertens.jpg\", res_mertens_8bit)\n\ntonemap3 = cv2.createTonemapReinhard(gamma=1.3)\ntonemap4 = 
cv2.createTonemapMantiuk(gamma=1.3)\ntonemap5 = cv2.createTonemapDrago(gamma=1.3)\ntonemap6 = cv2.createTonemap(gamma=1.3)\n\nres1 = tonemap3.process(hdr_image.copy())\nres2 = tonemap4.process(hdr_image.copy())\nres3 = tonemap5.process(hdr_image.copy())\nres4 = tonemap6.process(hdr_image.copy())\n\nimg1 = np.clip(res1*255, 0, 255).astype('uint8')\nimg2 = np.clip(res2*255, 0, 255).astype('uint8')\nimg3 = np.clip(res3*255, 0, 255).astype('uint8')\nimg4 = np.clip(res4*255, 0, 255).astype('uint8')\n\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/rerinhard.jpg\", img1)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/mantiuk.jpg\", img2)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/drago.jpg\", img3)\ncv2.imwrite(\n \"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_snapshot_iter/normal.jpg\", img4)\n\n\n# # Test all the caffe models in a sequence\n\n#%%\n\ncaffee_list = []\ncaffee_list = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr/*.caffemodel'))\n\nfor models in caffee_list:\n models = models.split('/')[7]\n cmd = \"./hdr-test.sh models\"\n os.system(cmd)\n\n\n# # Constructing a dictionary of no_of_iter to mean_loss_val\n\n#%%\n\nlog_file = []\nfile_list = []\nloss_list_mean = {}\n\nlog_file = sorted(glob.glob(\n '/misc/lmbraid18/bharadwk/workspace/ws1/testlogs/hdr_test_hdr_snapshot*.log'))\n\nfor logs in log_file:\n fo = open(logs, \"r\")\n logs = logs.split('/')[7]\n logs = logs.split('_')[5]\n logs = logs.split('.')[0]\n file_list = fo.readlines()\n loss_list = []\n mean = 0.0\n\n for pos, xitems in enumerate(file_list):\n if 'Successfully saved 1 blobs' in xitems:\n\n pos = pos + 1\n tmp_str = file_list[pos]\n tmp_str = tmp_str.split('=')[1]\n loss_list.append(float(tmp_str))\n\n mean = np.mean(loss_list)\n loss_list_mean.update({logs: mean})\n\nloss_list_mean = collections.OrderedDict(\n sorted(loss_list_mean.items(), key=lambda t: len(t[0])))\n\n#%%\n\nprint(min(loss_list_mean.items(), key=lambda x: x[1])[0])\nprint(loss_list_mean.get('120000'))\n#print (loss_list_mean.get('500000'))\n#print (len(loss_list_mean.values()))\n#print (loss_list_mean.values())\n\n\n# # Train error against Test error\n\n#%%\n\nlist_keys = list(loss_list_mean.keys())\nlist_values = list(loss_list_mean.values())\n#list_keys = list_keys[:20]\n#list_values = list_values[0:20]\n#print (list_values[0:40])\n#print (list_values)\n#keys_array = np.asarray(list_keys)\n#value_array = np.asarray(list_values)\n#plt.plot(keys_array, mean_loss_array, 'r')\n#plt.plot(keys_array, value_array, 'g')\n# plt.show()\n\n#%%\n\nplt.plot(sorted(list_keys), list_values, 'g')\nplt.show()\n# plt.savefig(\"/misc/lmbraid18/bharadwk/workspace/ws1/hdr_test_logs_philaug/philaugtgtest.jpg\")\n\n#%%\n\ncountRows = []\n# countRows=np.array(countRows)\nsigma = 500\ncount = 5000\nlog_file = []\nlog_file = sorted(\n glob.glob('/misc/lmbraid18/bharadwk/workspace/ws1/hdr14*.txt'))\n\nfor network in log_file:\n lossFile = network.split('/')[6]\n loss = np.loadtxt(network, dtype=np.float32,\n delimiter=',', skiprows=1, usecols=(0, 3))\n filter = np.exp(-4.0 * (np.arange(-4*sigma, 4*sigma + 1, 1) /\n sigma)**2) / (math.sqrt(2 * math.pi) * sigma)\n lossPadded = np.concatenate((np.ones(int(\n filter.shape[0] / 2)) * loss[0, 1], loss[:, 1], np.ones(int(filter.shape[0] / 2)) * loss[-1, 1]))\n lossFiltered = np.convolve(lossPadded, filter, mode=\"valid\")\n plt.plot(loss[:, 0], lossFiltered, 'b', label='train error')\n xloss = list(loss[:, 0])\n 
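    # lossFiltered above is the raw training loss smoothed with the Gaussian-shaped kernel
    # built from sigma (the series is edge-padded so the "valid" convolution keeps its
    # original length); xloss keeps the iteration numbers so the commented-out test-error
    # overlay after the loop can be drawn on the same x-axis via countRows.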
countRows.append(xloss)\n plt.ylim(0, 0.20)\n plt.hold(True)\n\ncountRows = sum(countRows, [])\nprint(len(countRows))\nnewArr = np.zeros(len(countRows), dtype=np.float32)\nfor loss in list_values:\n newArr[count-1] = loss\n count = count + 5000\n#plt.plot(countRows, newArr, 'g', label='test error')\n# plt.legend()\nplt.show()\n\n\n# # Extra code for general purpose\n\n#%%\n\nfo = open('/misc/lmbraid18/bharadwk/workspace/ws1/new_train_list.txt', 'r')\nhdr_files = fo.readlines()\n# glob.glob('/misc/lmbraid18/bharadwk/hdr_rendered_image/*')\nhdrARR = []\n\nfor items in hdr_files:\n dirName = items.split('/')[7]\n filename = dirName.split('.')[0]\n hdr_path = '/misc/lmbraid18/bharadwk/hdr_rendered_image/%s/hdr_image.exr' % (\n filename)\n hdr_image = cv2.imread(hdr_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)\n hdrARR.append(hdr_image)\n\nhdr_array = np.array(hdrARR, dtype=np.float32)\nfinHdrARR = hdr_array.ravel()\n#m3 = np.hstack((m1,m2))\n#axes = plt.gca()\n# axes.set_ylim([0,100])\n# axes.set_xlim([0,10000])\nfig = plt.figure()\nmu = np.mean(finHdrARR)\nsigma = np.std(finHdrARR)\nnum_bins = 50\n# the histogram of the data\nn, bins, patches = plt.hist(finHdrARR, num_bins, facecolor='green')\n# add a 'best fit' line\ny = mlab.normpdf(bins, mu, sigma)\nplt.title(\"High Dynamic Range Data frequency\")\nplt.plot(bins, y, 'r--')\nplt.xlabel('Range')\nplt.ylabel('Frequency')", "original_comment": "# Tweak spacing to prevent clipping of ylabel\n", "target_code": "plt.subplots_adjust(left=0.15)\n", "project_metadata": {"full_name": "kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN", "description": "Predict HDR images from LDR images using CNN", "topics": [], "git_url": "git://github.com/kangkanbharadwaj/High-Dynamic-Range-imaging-using-CNN.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2017-07-10T10:31:45Z", "size": 16499, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 96258, "Python": 44059, "MATLAB": 26466, "Shell": 15315, "M": 423}, "last_updated": "2020-07-07T08:49:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig = plt.figure()\nax = fig.add_subplot(111)\nax.plot(finHdrARR)\nax.set_xlabel('Range')\nax.set_ylabel('Frequency')\nax.set_title(\"High Dynamic Range Data frequency\")\nplt.show()\n", "model": "natural", "intent": "# Tweak spacing to prevent clipping of ylabel"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

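#
# Steps 4-6 above can also be read as a single combined encoding stage. A minimal sketch of
# that idea is shown below, assuming placeholder column lists (the real targ_enc_cols and
# ordenc_cols lists are only built in the target-encoding and ordinal-encoding sections
# later in this notebook):


import category_encoders as ce
import sklearn.pipeline as pipeline

example_target_cols = ['Neighborhood']   # placeholder; the full list is defined in the target-encoding section
example_ordinal_cols = ['ExterQual']     # placeholder; the full list is defined in the ordinal-encoding section

encoder_sketch = pipeline.Pipeline([
    ('target_enc', ce.TargetEncoder(cols=example_target_cols,
                                    min_samples_leaf=5, smoothing=0.1)),
    ('ordinal_enc', ce.OrdinalEncoder(cols=example_ordinal_cols)),
    ('onehot_enc', ce.OneHotEncoder(use_cat_names=True)),
])
# Calling encoder_sketch.fit_transform(X_train, y_train) on the prepared frame would then
# return an encoded modelling data set in one step.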
\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n\ndf = df_orig.copy()\n\n\ndf.info()\n\n\ndf.head(5)\n\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n\n# Use function to add in indicators for presence of null values\n\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood 
is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n\nfloat(9.00000).is_integer()\n\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n\n\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n\n# ?np.where\n\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n\ndf.head(5)\n\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n\ndf['MasVnrType'].value_counts()\n\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n\ndf[(df.BsmtQual_missing == True)]\n\n\ndf.BsmtCond.value_counts()\n\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from dataset\n\n\ndf.Electrical.value_counts()\n\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces 
in places with fireplaces missing\n\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n 
validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n\n# ## 2. Create additional bespoke data features\n\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n\ndf['Functional'].value_counts()\n\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n\ndf['PoolQC'].value_counts()\n\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n\ndf['Condition1'].value_counts()\n\n\ndf['Condition2'].value_counts()\n\n\ndf['Condition1']\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, inplace=True)\ndf.drop('Condition2', axis=1, 
inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n\ndf.head(10)\n\n\n# ***\n# ## 4. Set up target encoding parameters\n\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_oe.head(5)\n\n\n# ***\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n#%%\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n#%%\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n#%%\n\ndf = df_orig.copy()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(5)\n\n#%%\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n#%%\n\n# Use function to add in indicators for presence of null values\n\n#%%\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n#%%\n\ndf = denote_null_values(df)\n\n#%%\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n#%%\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n#%%\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n#%%\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n#%%\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n#%%\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', 
axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n#%%\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n#%%\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n#%%\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n#%%\n\nfloat(9.00000).is_integer()\n\n#%%\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n#%%\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n\n\n#%%\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n#%%\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n#%%\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n#%%\n\n# ?np.where\n\n#%%\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n#%%\n\ndf.head(5)\n\n#%%\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n#%%\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n#%%\n\ndf['MasVnrType'].value_counts()\n\n#%%\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n#%%\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n#%%\n\ndf[(df.BsmtQual_missing == True)]\n\n#%%\n\ndf.BsmtCond.value_counts()\n\n#%%\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from 
dataset\n\n#%%\n\ndf.Electrical.value_counts()\n\n#%%\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n#%%\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces in places with fireplaces missing\n\n#%%\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n#%%\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n#%%\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n#%%\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n#%%\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n#%%\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n 
validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n#%%\n\n# ## 2. Create additional bespoke data features\n\n#%%\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n#%%\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n#%%\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n#%%\n\ndf['Functional'].value_counts()\n\n#%%\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n#%%\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n#%%\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n#%%\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n#%%\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n#%%\n\ndf['PoolQC'].value_counts()\n\n#%%\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n#%%\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n#%%\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n#%%\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n#%%\n\ndf['Condition1'].value_counts()\n\n#%%\n\ndf['Condition2'].value_counts()\n\n#%%\n\ndf['Condition1']\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n#%%\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n#%%\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n#%%\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n#%%\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n#%%\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, 
inplace=True)\ndf.drop('Condition2', axis=1, inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n#%%\n\ndf.head(10)\n\n#%%\n\n# ***\n# ## 4. Set up target encoding parameters\n\n#%%\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n#%%\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n#%%\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n#%%\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_oe.head(5)\n\n\n# ***", "original_comment": "# ## 5. 
Set up OneHot encoding parameters\n", "target_code": "onehot_enc = ce.OneHotEncoder(verbose=1, cols=[\n 'Street', 'Alley', 'CentralAir', 'MiscFeature'], use_cat_names=True)\nonehot_enc.get_params()\n", "project_metadata": {"full_name": "JonathanBechtel/DAT-10-19", "description": "GitHub Repo For DAT 10-19", "topics": [], "git_url": "git://github.com/JonathanBechtel/DAT-10-19.git", "stars": 2, "watchers": 2, "forks": 11, "created": "2020-10-19T14:53:15Z", "size": 108252, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 72671490, "HTML": 915086, "Python": 92446, "Shell": 222}, "last_updated": "2021-01-06T23:37:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_oe = pd.DataFrame(df_oe)\ndf_oe.head(5)\n", "model": "no-comments", "intent": "# 5. Set up OneHot encoding parameters"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_moons, make_circles\nimport tensorflow as tf\nimport numpy as np\nimport math\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # NNIA: Tutorial 5 - 12.12.2017\n\n# ---\n\n#%%", "original_comment": "# Set notebook to full width\n", "target_code": "from IPython.core.display import display, HTML\ndisplay(HTML(\"\"))\n", "project_metadata": {"full_name": "mmarius/nnia-tutorial", "description": "Repository for my tutorial group which is part of the lecture Neural Networks: Implementation and Application", "topics": [], "git_url": "git://github.com/mmarius/nnia-tutorial.git", "stars": 9, "watchers": 9, "forks": 0, "created": "2017-11-02T15:20:51Z", "size": 12430, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1494110}, "last_updated": "2020-05-07T22:34:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "X, y = make_circles(n_samples=1000, noise=0.1, factor=0.5)\n", "model": "no-comments", "intent": "# Set notebook to full 
width"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. 
Calculate Reduced Excess Rainfall:\n\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---\n\n\n\nif lid.shape[0] > 0:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ###### Reduced Events Table\n#\n# __Description__: Reduced excess rainfall is calculated for each event using a user-specified stormwater removal rate, capacity, and efficiency.\n#\n# __Input__: A JSON file containing the incremental excess rainfall for a suite of events which may have different durations and boundary condition names, and the stormwater removal rate, capacity, and efficiency.\n#\n# __Output__:\n#\n# - The unreduced incremental excess rainfall for each event as a JSON.\n#\n# - The incremental reduced excess rainfall and lateral inflow hydrographs for each event as a JSON.\n#\n# - A metadata file containing the stormwater removal rate, the stormwater capacity, the stormwater efficiency, and the seed of the random number generator.\n#\n# ---\n# ## A. Load Libraries, Parameters, and Data:\n# ### Libraries:\n\n#%%\n\nfrom hydromet import*\nimport hydromet_reduced\nimport sys\nsys.path.append('../../core')\n\n\n# ### Parameters:\n# #### Papermill (site specific):\n# Filenames and paths:\nProject_Area = 'Test' # Project area name\nPluvial_Model = 'P01' # Pluvial model name\nBCN = 'D30' # Domain/boundary condition name\n\npluvial_params = '{0}_{1}_Pluvial_Parameters.xlsx'.format(\n Project_Area, Pluvial_Model)\nrate_column = 'SW Rate (in/30min)'\ncapacity_column = 'SW Capacity (in)'\nefficiency_column = 'SW Efficiency'\n\nroot_dir = pl.Path(os.getcwd())\noutputs_dir = root_dir/'Outputs'\ninputs_dir = root_dir/'Inputs'\npluvial_params_dir = inputs_dir/pluvial_params\n\n\n# Options:\ndisplay_plots = True # Option to display plots\ndisplay_print = True # Option to display print statements\n# ##### Convert all paths to objects:\n\n#%%\n\noutputs_dir = pl.Path(outputs_dir)\npluvial_params_dir = pl.Path(pluvial_params_dir)\n\n\n# ##### Set the base filestem for reading/writing files:\n\n#%%\n\nfilestem = '{0}_{1}_{2}'.format(Project_Area, Pluvial_Model, BCN)\n\n\n# ### Data:\n# #### Stormwater removal rate, capacity, and efficiency:\n\n#%%\n\nrate, maxcap, efficiency = get_stormwater_rate_cap(\n pluvial_params_dir, BCN, rate_column, capacity_column, efficiency_column, display_print)\n\n\n# ##### Adjust stormwater rate and capacity by stormwater efficiency:\n\n#%%\n\nadj_rate, adj_maxcap = adj_stormwater_rate_cap(\n rate, maxcap, efficiency, display_print)\n\n\n# #### Lateral inflow domains:\n\n#%%\n\nlid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n\n\n# #### Excess rainfall data:\n\n#%%\n\nwith open(outputs_dir/'{}.json'.format(filestem)) as f:\n EventsTable = json.load(f)\n\n\n# ##### Extract the durations:\n\n#%%\n\ndurations = list(EventsTable.keys())\nprint('Durations:', durations)\n\n\n# ---\n# ## B. Calculate Reduced Excess Rainfall:\n\n#%%\n\nReducedTable, StormwaterTable, SW_variables = hydromet_reduced.main(\n EventsTable, durations, BCN, rate=adj_rate, maxcap=adj_maxcap, display_print=display_print)\n\n\n# ---", "original_comment": "# ## C. 
Calculate the Lateral Inflow Hydrographs:\n", "target_code": " ReducedTable = calc_lateral_inflow_hydro(\n lid, ReducedTable, StormwaterTable, durations, BCN, display_print)\n", "project_metadata": {"full_name": "Dewberry/pfra-hydromet", "description": "Tools for developing pluvial (excess rainfall) and fluvial scenarios for probabilistic flood risk analyses", "topics": ["hydrology", "papermill", "montecarlo-simulation"], "git_url": "git://github.com/Dewberry/pfra-hydromet.git", "stars": 11, "watchers": 11, "forks": 12, "created": "2019-04-18T13:04:55Z", "size": 165396, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 59869977, "Python": 186157}, "last_updated": "2020-10-27T14:37:20Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "lid = get_lateral_inflow_domains(pluvial_params_dir, BCN, display_print)\n# ---\n#\n", "model": "natural", "intent": " # C. Calculate the Lateral Inflow Hydrographs:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time 
series by two weeks.\n\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]\n\n\n\ndef print_hw_parameters(model):\n alpha, beta, gamma = model.params['smoothing_level'], model.params[\n 'smoothing_slope'], model.params['smoothing_seasonal']\n print(\"Holt-Winters parameters:\")\n print(\"Alpha: \", alpha)\n print(\"Beta: \", beta)\n print(\"Gamma: \", gamma)\n\n\nprint(\"Forecasting started...\")\nstart_time = time.time()\n\ntry:\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series forecasting using Holt-Winters\n# ### Import necessary libraries\n\n#%%\n\nimport statsmodels.tsa.holtwinters as hw\nimport matplotlib.pyplot as ma\nimport time\nimport sys\nimport datetime\nimport pandas\nimport numpy\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# ### Load necessary CSV file\n\n#%%\n\ntry:\n ts = pandas.read_csv('../../datasets/srv-1-art-1h.csv')\nexcept:\n print(\"I am unable to connect to read .csv file\", sep=',', header=1)\n\nts.index = pandas.to_datetime(ts['ts'])\n\n# delete unnecessary columns\ndel ts['id']\ndel ts['ts']\ndel ts['min']\ndel ts['max']\ndel ts['sum']\ndel ts['cnt']\ndel ts['p50']\ndel ts['p95']\ndel ts['p99']\n\n# print table info\nts.info()\n\n\n# ### Get values from specified range\n\n#%%\n\nts = ts['2018-06-16':'2018-07-15']\n\n\n# ### Remove possible NA values (by interpolation)\n# NA values are explicitely removed by linear interpolation.\n\n#%%\n\ndef print_values_stats():\n print(\"Zero Values:\\n\", sum([(1 if x == 0 else 0) for x in ts.values]), \"\\n\\nMissing Values:\\n\", ts.isnull(\n ).sum(), \"\\n\\nFilled in Values:\\n\", ts.notnull().sum(), \"\\n\")\n\n\nidx = pandas.date_range(ts.index.min(), ts.index.max(), freq=\"1H\")\nts = ts.reindex(idx, fill_value=None)\nprint(\"Before interpolation:\\n\")\nprint_values_stats()\nts = ts.replace(0, numpy.nan)\nts = ts.interpolate(limit_direction=\"both\")\nprint(\"After interpolation:\\n\")\nprint_values_stats()\n\n\n# ### Plot values\n\n#%%\n\n# Idea: Plot figure now and do not wait on ma.show() at the end of the notebook\nma.ion()\nma.show()\nfig1 = ma.figure(1)\nma.plot(ts, color=\"blue\")\nma.draw()\ntry:\n ma.pause(0.001) # throws NotImplementedError, ignore it\nexcept:\n pass\n\n\n# ### Split time series into train and test series\n# We have decided to split train and test time series by two weeks.\n\n#%%\n\ntrain_data_length = 24*7\nts_train = ts[:train_data_length]\nts_test = ts[train_data_length+1:]", "original_comment": "# ### Fit and predict Time Serie\n", "target_code": " model = hw.ExponentialSmoothing(\n ts_train, seasonal='additive', seasonal_periods=train_data_length-1).fit()\n predictions = model.predict(start=ts_test.index[0], end=ts_test.index[-1])\n", "project_metadata": {"full_name": "CSIRT-MU/QoSForecastLSTM", "description": "An evaluation of QoS forecast methods described in paper Quality of Service Forecasting with LSTM Neural Networks", "topics": ["publication"], "git_url": "git://github.com/CSIRT-MU/QoSForecastLSTM.git", "stars": 4, "watchers": 4, "forks": 2, "created": "2018-09-05T07:37:36Z", "size": 10237, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 16021131}, "last_updated": "2020-03-27T12:49:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, 
"usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model = sm.tsa.statespace.SARIMAX(ts_train, order=(\n 0, 1, 0), seasonal_order=(0, 1, 0, 24)).fit()\nprint(\"done in %0.3fs\" % (time.time() - start_time))\n", "model": "no-comments", "intent": " # Fit and predict Time Serie"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n#%%\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import optimizers\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n#%%\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n#%%\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n#%%\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n#%%\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n#%%\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n#%%\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))", "original_comment": "# compling and show model\n", "target_code": "from keras import optimizers\n\nmodel.compile(loss=\"categorical_crossentropy\", optimizer=optimizers.SGD(\n lr=0.0001, momentum=0.9), metrics=[\"accuracy\"])\nmodel.summary()\n", "project_metadata": {"full_name": "WuZhuoran/Plant_Seedlings_Classification", "description": "Kaggle Competition Project as well as ANLY 590 Final Project. 
Task: Determine the species of a seedling from an image", "topics": [], "git_url": "git://github.com/WuZhuoran/Plant_Seedlings_Classification.git", "stars": 10, "watchers": 10, "forks": 7, "created": "2018-10-31T01:19:27Z", "size": 10167, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2140227, "Python": 31477}, "last_updated": "2020-12-18T16:42:52Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model.compile(loss='categorical_crossentropy',\n optimizer='adam', metrics=['accuracy'])\nmodel.summary()\n", "model": "natural", "intent": "# compling and show model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom sklearn.preprocessing import MinMaxScaler\nimport numpy as np\nimport sklearn\nimport pandas as pd\nimport warnings\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.filterwarnings('ignore')\n\n\n# ## Input Data\n\n# ### Train target\n\n#%%\n\ntrain_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')", "original_comment": "# normalize the format of DATE\n", "target_code": "train_target['Date'] = pd.to_datetime(train_target['Date'])\n", "project_metadata": {"full_name": "Quan-Sun/TADPOLE-ECE5970", "description": "machine learning with biomedical data", "topics": [], "git_url": "git://github.com/Quan-Sun/TADPOLE-ECE5970.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2018-11-16T21:39:24Z", "size": 15650, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5564392}, "last_updated": "2019-04-19T22:32:32Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": 
"Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "test_target = pd.read_csv('../data/TADPOLE_TargetData_test.csv')\n", "model": "no-comments", "intent": "# normalize the format of DATE"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. 
Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# **Porto Competition**\n#\n# So I've been reading a lot about this Kaggel competitions, and tried to execute a couple of kernels myself, some with good results, others total failures... So I finally decided to join this competition and see how well it goes, and I also decided to stop using my personal laptop and give a try to this kaggle kernels and see how they perform. I will be using this notebook as reference (https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial).\n\n# Anyways, if I find something nice on this kernel I will publish it later (try to get away from novice level!), if not at least I will try to do some feature engineering using this, eventually I will need to execute some portion of the code either in a dedicated kernell or rent some time on AWS.\n#\n# I have three major intentions with this tutorial: (sorry about the typos I will fix them at some point in the future)\n#\n# **1. Data validation Check.** Validation if there is any null, -1 or Nan.\n#\n# **2. Feature Inspection. **Correlation plots, inspect the data.\n#\n# **3. Feature importance** and analysis for implementing the classificaton methods.\n\n# Importing the useful functions, packages and others.\n\n#%%\n\nfrom sklearn.ensemble import ExtraTreesClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom plotly import tools\nimport plotly.plotly as plpl\nimport plotly.graph_objs as go\nfrom collections import Counter\nimport seaborn as sns\nimport warnings\nimport plotly.tools as tls\nimport plotly.offline as py\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\npy.init_notebook_mode(connected=True)\n\n# Try ploty libraries\n\nplt.style.use('fivethirtyeight')\n\nwarnings.filterwarnings('ignore')\n\n\n# from subprocess import check_output\n# print(check_output([\"ls\", \"../input\"]).decode(\"utf8\"))\n\n\n# Some data visualization, first see what we got and then we can start cleaning up the dataset.\n\n#%%\n\ntrain = pd.read_csv(\"../input/train.csv\")\ntest = pd.read_csv(\"../input/test.csv\")\ntrain.head(20)\n\n#%%\n\ntest.head()\n\n\n# I like to see some statistical information about the dataset. Since we have a lot of features, it's going to be a lot of information, but if at some point I will use feature engineering I would need to go back here and think about something.\n#\n#\n\n#%%\n\n# train.shape\npd.set_option('precision', 3)\ntrain.describe()\n\n\n# **Part One: Data validation Checks**\n#\n# We can run a simple validation from the dataset just checking if there is any null.****\n\n#%%", "original_comment": "# Check if there is any null information anywhere\n", "target_code": "train.isnull().any().any()\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(train.shape)\nprint(test.shape)\n", "model": "no-comments", "intent": "# Check if there is any null information anywhere"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n\nfeatures = iris_df.values\n\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n\n\n\n\n# ### Split the 
data into training and testing data, with random seeding\n\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Classification - Logistic Regression - IRIS problem\n\n# #### Import pandas for importing iris dataset\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\n\n\n# from dataset (iris data) importing 4 features, sepal len & wid, petal len & wid\n\n#%%\n\niris_df = pd.read_csv('iris.csv', skiprows=1, names=[\n 'sepal_len', 'sepal_width', 'petal_len', 'petal_width'], usecols=[0, 1, 2, 3])\n\n\n# Dataframe heads\n\n#%%\n\niris_df.head()\n\n\n# #### From iris data, importing labels (Setosa, versicolor, verginica), masked as 0,1,2 respectively\n\n#%%\n\nlabels_df = pd.read_csv('iris.csv', skiprows=1, names=['Species'], usecols=[4])\n\n#%%\n\nlabels_df.head()\n\n\n# #### Converting dataframe into numpy array using values attribute\n\n#%%\n\nfeatures = iris_df.values\n\n#%%\n\nlabels = labels_df.values.ravel()\n\n\n# The ravel() method returns a flattened (1-Dimensional) NumPy array\n\n# ### Logistic Regression - import\n\n#%%\n\n\n\n#%%\n\n# ### Split the data into training and testing data, with random seeding\n\n#%%\n\nx_train, x_test, y_train, y_test = train_test_split(\n features, labels, test_size=0.30, random_state=2)", "original_comment": "# ### Creating an instance of LogisticRegrssion class\n", "target_code": "from sklearn.linear_model import LogisticRegression\n\nlogReg = LogisticRegression()\n", "project_metadata": {"full_name": "naveen21553/ml-workshop", "description": "Machine Learning Workshop Resources", "topics": [], "git_url": "git://github.com/naveen21553/ml-workshop.git", "stars": 12, "watchers": 12, "forks": 14, "created": "2018-09-28T15:03:08Z", "size": 5274, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 685393, "Python": 11705}, "last_updated": "2020-10-11T10:46:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "logreg = LogisticRegression(solver='lbfgs')\n", "model": "natural", "intent": "# Creating an instance of LogisticRegrssion class"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest 
way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n\ncombine[0]\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This notebook contains an example dataset to help us get the basic understanding of how series and DataFrame math work in PANDAS\n#\n\n#%%\n\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport sys\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# first we are going to create two series\n\n#%%\n\nnp.random.seed(8)\ns1 = pd.Series(np.random.randn(5))\ns1\n\n#%%\n\ns2 = pd.Series(np.random.randn(5))\ns2\n\n#%%\n\ncombine = pd.concat([s1, s2])\ncombine\n\n\n# as we can see this isn't the cleanest way to combine these two series as if we call 0 we will get both 0 values which could be problematic for analyis\n\n#%%\n\ncombine[0]", "original_comment": "# instead we can reindex:\n", "target_code": "combine.index = range(combine.count())\n", "project_metadata": {"full_name": "ContextLab/CDL-tutorials", "description": "Repo containing useful tutorials on different topics, methods, software tools, and packages used by the CDL", "topics": ["tutorial", "training-materials", "python", "bayesian-methods", "package-creation", "scientific-computing"], "git_url": "git://github.com/ContextLab/CDL-tutorials.git", "stars": 12, "watchers": 12, "forks": 2, "created": "2017-12-15T13:36:50Z", "size": 59045, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 694197, "Python": 17099, "TeX": 9149, "Makefile": 5644, "Batchfile": 5096, "Dockerfile": 3050, "Shell": 128}, "last_updated": "2020-07-13T19:39:57Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "combine.reset_index(inplace=True)\ncombine\n", "model": "natural", "intent": "# we can reindex:"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# This is a script written by Zach Monge to accompany my Medium blog post \"Does Deep Learning Really Require 'Big Data'? -- No!\" (https://medium.com/@zachmonge). To exmplify how deep learning can work quite well on small datasets, I will train a classifier to distinguish between pictures of clown fish and blue damsels. These images were downloaded from Google Image.\n#\n# This script makes use of the deep learning library fastai, which is written on top of PyTorch. I would like to thank the creators of fastai for this amazing deep learning library and for their lessons. 
During training I used Google Cloud Platform and a K80 GPU.\n\n#%%\n\n# Importing functions\nimport glob\nfrom fastai.plots import *\nfrom fastai.sgdr import *\nfrom fastai.dataset import *\nfrom fastai.model import *\nfrom fastai.conv_learner import *\nfrom fastai.transforms import *\nfrom fastai.imports import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Examining the data\n\n#%%\n\n# Path to data\nPATH = '/home/zachm/fastai_data/fish/'\nos.listdir(PATH)\n\n#%%\n\n# As can be seen, there are two image categories -- damsels and clown fish (titled clown)\nos.listdir(f'{PATH}/train')\n\n#%%\n\n# Number of training items for each category\n[len(os.listdir(f'{PATH}/train/clown')),\n len(os.listdir(f'{PATH}/train/damsel'))]\n\n#%%\n\n# Number of items in the validation set for each category. I typically include about\n# 20% of the items in the validation set, but here I have 50% since this is such a\n# small dataset\n[len(os.listdir(f'{PATH}/valid/clown')),\n len(os.listdir(f'{PATH}/valid/damsel'))]\n\n#%%\n\n# Creating lists of pictures in training set\nclow_train_imgs = glob.glob(f'{PATH}train/clown/*')\ndamsel_train_imgs = glob.glob(f'{PATH}train/damsel/*')\n\n#%%\n\n# Example picture of clownfish\nimg = plt.imread(clow_train_imgs[0])\nplt.imshow(img)\n\n#%%\n\n# Example picture of damsel\nimg = plt.imread(damsel_train_imgs[0])\nplt.imshow(img)\n\n\n# # Training model\n\n# To train this model we will take advantage of a model that was pretrained on a large dataset of images. This dataset is the famous ImageNet in which the training set contains over a million of pictures. The specific model architecure I used was ResNet-34.\n\n#%%\n\n# Specifying the model architecture\narch = resnet34\n\n# Specifying the size the images will be cropped to. I chose 224 because this is the size the model was originially trained on.\n# The GPU is most efficient when all of the images are the same size\nsz = 224", "original_comment": "# Loading in the data.\n", "target_code": "data = ImageClassifierData.from_paths(\n PATH, tfms=tfms_from_model(arch, sz), bs=64)\n", "project_metadata": {"full_name": "zachmonge/fish_computer_vision_example", "description": "This is the repository corresponding to my Medium blog post titled \"Does Deep Learning Really Require 'Big Data'? 
--No!\"", "topics": [], "git_url": "git://github.com/zachmonge/fish_computer_vision_example.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2018-08-20T03:51:12Z", "size": 8148, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1391423}, "last_updated": "2020-02-13T19:27:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "data = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch, sz))\nlearn = ConvLearner.pretrained(arch, data, precompute=True)\n", "model": "docstring", "intent": "# Loading in the data."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data\n\n\n\ndef MinMaxScaler(data):\n ''' Min Max Normalization\n Parameters\n ----------\n data : numpy.ndarray\n input data to be normalized\n shape: [Batch size, dimension]\n Returns\n ----------\n data : numpy.ndarry\n normalized data\n shape: [Batch size, dimension]\n References\n ----------\n .. 
[1] http://sebastianraschka.com/Articles/2014_about_feature_scaling.html\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n#%%\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n#%%\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data", "original_comment": "# ### MinMax Scaling\n", "target_code": " import numpy as np\n\n numerator = data - np.min(data, 0)\n denominator = np.max(data, 0) - np.min(data, 0)\n return numerator / (denominator + 1e-7)\n", "project_metadata": {"full_name": "jwlee-ml/TensorFlow_Training_13th", "description": "Tensorflow\ub85c \uc2dc\uc791\ud558\ub294 \ub525\ub7ec\ub2dd Camp 13\uae30 \uc2e4\uc2b5", "topics": [], "git_url": "git://github.com/jwlee-ml/TensorFlow_Training_13th.git", "stars": 4, "watchers": 4, "forks": 5, "created": "2019-06-14T14:39:05Z", "size": 23519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23325250}, "last_updated": "2019-11-05T13:31:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "scaler = MinMaxScaler(data)\ndata = scaler.fit_transform(data)\n", "model": "docstring", "intent": " # MinMax Scaling"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to 
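# Two sketches for the stock-prediction record that closes just above.
# (1) A possible body for its MinMaxScaler helper, mirroring that record's
#     target code: column-wise min-max normalization with a small epsilon to
#     avoid division by zero.
# (2) A purely illustrative Keras model wired from the hyperparameters the
#     record defines (seq_length, data_dim, hidden_size, output_dim,
#     learning_rate). The source does not show its model definition, so treat
#     this as an assumption, not the author's architecture.
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers


def min_max_scale(data):
    # normalize each column to [0, 1]; 1e-7 guards against a zero range
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    return numerator / (denominator + 1e-7)


seq_length, data_dim, hidden_size, output_dim, learning_rate = 7, 5, 10, 1, 1e-3
model = keras.Sequential([
    layers.LSTM(hidden_size, input_shape=(seq_length, data_dim)),
    layers.Dense(output_dim),
])
model.compile(optimizer=keras.optimizers.Adam(learning_rate), loss='mse')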
women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = 
low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin 
weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### 
Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
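A scalar paired with a list-valued key is simply repeated down the whole column, for example (a minimal sketch with made-up values, assuming pandas is imported as ```pd```):\n#\n# ```python\n# pd.DataFrame({'city': ['Austin', 'Dallas', 'Waco'], 'state': 'TX'})\n# # the scalar 'TX' is broadcast to all three rows\n# ```\n#\n# 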
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
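For instance, a call of roughly this shape swaps in your own labels (a sketch; the label list is illustrative):\n#\n# ```python\n# pd.read_csv(data_file, names=['year', 'population'])\n# ```\n#\n# 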
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
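The call you are building toward has this general shape (a sketch only; the column names are placeholders for the ones you will find in ```df```):\n#\n# ```python\n# df.plot(x='Month', y=['AAPL', 'IBM'])\n# ```\n#\n# 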
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)", "original_comment": "# Add the title\n", "target_code": "plt.title('Fuel efficiency vs Horse-power')\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.title('Scatter plot')\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "model": "docstring", "intent": "# Add the title"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n\nimport 
matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n\n# ## Data\n\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n\n# DATA_DF.content[0]\n\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n#%%\n\n# ## Data\n\n#%%\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n#%%\n\n# DATA_DF.content[0]\n\n#%%\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n#%%", "original_comment": "# remove reviews without audio features from Spotify\n", "target_code": "DATA_DF = DATA_DF.loc[~DATA_DF.audio_features.isna()]\n", "project_metadata": {"full_name": "iconix/openai", "description": "OpenAI Scholar, general materials", "topics": [], "git_url": "git://github.com/iconix/openai.git", "stars": 16, "watchers": 16, "forks": 3, "created": "2018-11-02T19:26:13Z", "size": 69033, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 22113154, "Python": 46353, "JavaScript": 8783, "Shell": 2297, "HTML": 970}, "last_updated": "2020-06-01T14:04:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, 
"precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "DATA_DF = DATA_DF[DATA_DF.content.str.len() > 0]\nlen(DATA_DF)\n", "model": "natural", "intent": "# remove reviews without audio features from Spotify"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n#\n# Dec 10 - Dec 20, 2019\n#\n# Lab on Logistic Regression
\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n\ndf.columns\n\n\ndf.describe()\n\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n\ndf.head()\n\n\ndf.isnull().any()\n\n\n\nvariables = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',\n 'Ticket', 'Fare', 'Embarked', 'Survived', 'Initial']\n# Calculate the correlations\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n#\n# Dec 10 - Dec 20, 2019\n#\n# Lab on Logistic Regression
\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n#%%\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n#%%\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n#%%\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.describe()\n\n#%%\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.isnull().any()", "original_comment": "# # Correlation between variables\n", "target_code": "corr_mat = df[variables].corr().round(2)\n", "project_metadata": {"full_name": "ghimireadarsh/AI-WinterSchool", "description": "Comprises of various lecture slides, papers, practical notebooks used during AI Winter school, organized by NAAMII at Pokhara, Nepal from December 10, 2019 to December 20, 2019. ", "topics": [], "git_url": "git://github.com/ghimireadarsh/AI-WinterSchool.git", "stars": 6, "watchers": 6, "forks": 6, "created": "2019-12-14T18:16:09Z", "size": 75918, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1041087, "HTML": 666537, "Python": 20395}, "last_updated": "2020-09-27T21:32:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "corr = df[variables].corr()\nsns.heatmap(corr)\nplt.show()\n", "model": "docstring", "intent": "# Correlation between variables"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n\nX_train.shape, Y_train.shape\n\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", fontsize=16)\n plt.ylabel(\"$x(t)$\", 
fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
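A quick back-of-the-envelope check makes this concrete (a rough sketch of my own, reusing the approximate 6th-step statistics from the data generation above: component means of about +/-0.25, a 65/35 split, 0.05 noise std):\n\n\n# the MSE-optimal point forecast is the conditional mean, which lands in the low-density gap between the two modes\nsim = np.concatenate([np.random.normal(0.25, 0.05, 650), np.random.normal(-0.25, 0.05, 350)])\nprint(sim.mean())  # close to 0.65*0.25 - 0.35*0.25 = 0.075, a value the real paths almost never take\n\n\n# 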
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
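In general such a fit means estimating the parameters of p(y) = w_1*N(y; mu_1, sigma_1) + w_2*N(y; mu_2, sigma_2) with w_1 + w_2 = 1. One way to do that without hand-labelling the two branches is to let expectation-maximization find the components (a rough sketch with scikit-learn's GaussianMixture; below I take the simpler route of splitting the samples by sign):\n\n\n# EM-based alternative for cases where the modes are not separable by a simple threshold\nfrom sklearn.mixture import GaussianMixture\ngmm = GaussianMixture(n_components=2, random_state=42).fit(Y_test[:, 5].reshape(-1, 1))\nprint(gmm.weights_)  # roughly [0.35, 0.65] (component order may vary)\nprint(gmm.means_.ravel(), np.sqrt(gmm.covariances_).ravel())  # component means and standard deviations\n\n\n# 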
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
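(Before looking at that chart, a short aside. The returned tfd.Distribution object also exposes aggregate quantities directly, for example the overall mean path, which is comparable to the plain LSTM point forecast, and random sample paths; below is a minimal sketch using only these generic distribution methods.)\n\n\n# aggregate quantities of the forecast distribution\nprint(yhat.mean().numpy().shape)   # (1, 10): the weight-averaged mean path\npaths = yhat.sample(5).numpy()     # (5, 1, 10): five random forecast scenarios\nprint(paths.shape)\n\n\n# 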
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Time series prediction with multimodal distributions - Building Mixture Density Network with Keras and Tensorflow Probability\n\n# The two most common neural network problems are regression and classification. One of the major differences between the two is that classification outputs the probability of a given class, while regression outputs the value of the predicted variable without any information about the uncertainty of the forecast. Even classification models output only rigid numbers, not distributions, but most of the time this approach is satisfactory to estimate the uncertainty of the prediction. 
Usually, we want something like \"class B has a chance of 0.73\", and not something like \"according to our fitted normal distribution there is 60% chance, that the chance of class B is between 0.63 and 0.8\".\n#\n# To address this problem we can use Monte Carlo Dropout, here you can find a very good explanation: [link](https://www.depends-on-the-definition.com/model-uncertainty-in-deep-learning-with-monte-carlo-dropout/).\n# Monte Carlo Dropout can be a good choice in some cases, but I will show an example, where this technique won't really improve our forecast, because the typical loss functions (mostly MSE) used in regression will always tend to center the output around the mean of the distribution, and can't capture multimodal phenomenons.\n#\n# Recently I started to explore [Tensorflow Probability](https://www.tensorflow.org/probability), a library built on Tensorflow, which enables us to estimate the aleatoric uncertainty (known unknowns) and epistemic uncertainty (unknown unknowns) of our model and data. [This article](https://blog.tensorflow.org/2019/03/regression-with-probabilistic-layers-in.html) gives a really good basic idea about the potential of this library to estimate model uncertainty, but Tensorflow Probability has much more use cases beyond neural networks.\n#\n# In this article, I will focus on the estimation of the known unknowns. Using Tensorflow Probability I will build an LSTM based time-series forecaster model, which can predict uncertainty and capture multimodal patterns if it exists in the data. These types of networks are called Mixture Density Networks.\n\n#%%\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nimport os\nimport pandas as pd\nimport sklearn\nimport sys\nimport tensorflow as tf\nimport tensorflow_probability as tfp\nfrom tensorflow_probability import distributions as tfd\n\nfrom tensorflow import keras\nimport time\n\n#%%\n\nprint(\"python\", sys.version)\nfor module in mpl, np, pd, sklearn, tf, keras:\n print(module.__name__, module.__version__)\n\n#%%\n\nassert sys.version_info >= (3, 5) # Python \u22653.5 required\nassert tf.__version__ >= \"2.0\" # TensorFlow \u22652.0 required\n\n\n# ## The Dataset\n\n#%%\n\ndef generate_time_series(batch_size, n_in, n_out):\n \"\"\"\n Generate a batch of time-series, where the target part is randomly\n goes up or down\n \"\"\"\n t = np.arange(n_in + n_out)\n series = np.zeros((batch_size, n_in + n_out))\n series += 0.1 * (np.random.normal(0, 0.5, (batch_size, n_in + n_out)))\n series[:, -n_out:] += (np.arange(n_out)) * 0.05\n # randomize the output part\n sign = (np.random.choice([-1, 1], batch_size, p=[0.35, 0.65]))\n series[:, -n_out:] *= sign[..., np.newaxis]\n return series[..., np.newaxis].astype(np.float32)\n\n#%%\n\n# generate the time-series, train, valid and test set\n\nnp.random.seed(42)\n\nn_in = 30 # the length of the input part\nn_out = 10 # the length of the output (forecasted) part\nseries = generate_time_series(5000, n_in, n_out)\nprint(series.shape)\nX_train, Y_train = series[:3000, :n_in], series[:3000, n_in:]\nX_valid, Y_valid = series[3000:4000, :n_in], series[3000:4000, n_in:]\nX_test, Y_test = series[4000:, :n_in], series[4000:, n_in:]\nY_train = np.squeeze(Y_train)\nY_valid = np.squeeze(Y_valid)\nY_test = np.squeeze(Y_test)\n\n#%%\n\nX_train.shape, Y_train.shape\n\n#%%\n\ndef plot_multiple_forecasts(X, Y, Y_pred=None, title=\"\"):\n n_in = X.shape[1]\n n_out = Y.shape[1]\n fig = plt.figure(figsize=(10, 6))\n plt.xlabel(\"$t$\", 
fontsize=16)\n plt.ylabel(\"$x(t)$\", fontsize=16)\n plt.title(title, fontsize=16)\n for i in range(20):\n plt.plot(X[i, :, 0])\n plt.plot(np.arange(n_in, n_in + n_out), Y[i, :], \"x-\")\n if(Y_pred is not None):\n plt.plot(np.arange(n_in, n_in + n_out),\n Y_pred[i, :], \"+-\", markersize=10)\n\n\n# In the chart below we can see the shape of our series. I wanted to use as simple data as possible to show some pitfalls of non-probabilistic models. Instead of a continuous time-series, I generated a batch of samples with the same patterns. With this data, it is easier to show the behavior of our forecast. The input data (X) is a 30 steps series without any pattern or slope, it is only white noise. The target part (Y) goes up with a 65% chance and goes down with a 35% chance, and has some noise as well.\n#\n# It is easy to recognize the bimodal nature of the target forecast steps by humans, and it is noticeable that the up-trend is more common than the down-trend. If we would stick one sample to the end of the other and would make a common continuous time-series it would be harder to recognize this bimodal nature of the series, and in case of real data, we are rarely able to recognize similar patterns. With neural networks, our input and output space can have multiple dimensions. Multi-dimension datasets make it even harder or impossible to catch potential multinomial divergences looking at simple analysis charts, and these divergences can be very hard to be found even with very careful and extensive examinations. But the power of neural networks can help us here if we build the appropriate model.\n#\n# Bimodal or multimodal patterns aren't so rare that we should neglect them all the time. Some example where this kind of pattern can occur:\n#\n# - Financial time-series at regular economic news can go up and down based on the surprise of the incoming data. As far as we don't know the side of the surprise (if the economic news is better or worse than we expected), the movement of the price will have bimodal distribution based on our knowledge.\n# - Peak traffic hours or restaurant hours, or a lot of other things in our timetable.\n# - Daily average precipitation during the year in a large part of the world.\n#\n# These are obvious examples, not hard to show on a histogram, but neural networks can be able to find \"latent\" multimodality, because of their power in pattern recognition.\n#\n# Among the examples above the first example deserves more regard. Our historical series obviously will be the same regardless of our input data. But the distribution of the forecast and the modality of the forecasted distribution will depend on our prior knowledge - our input data.\n#\n# Here I make some assumptions about the possible forecasted distributions to show how important can be our prior knowledge, and how it can alter our posterior distribution, but I have to stress that these are only my actual assumptions. I will examine in an upcoming article if the forecast distributions really behave this way or not.\n#\n# In our thought experience, we use the USD/JPY pair, which in my experience is very sensitive to regular economic news outcomes. But what is a surprise in economic news term? Before the regular economic news or indicators are released, there is a consensus or estimation of the expected indicator number. The consensus number is the general agreement of experts on the outcome of the number. 
When the real indicator about inflation, GDP, Non-Farm Payroll or other official data comes out it is usually larger or smaller than the earlier consensus. Depending on the deviation from the consensus this can be a smaller or bigger surprise, and big surprises usually affect the price movement.\n#\n# Let's distinguish three different priors:\n#\n# - We know only the earlier movement of the price.\n# - We know the earlier movement of the price and the time of economic news.\n# - We know the earlier movement of the price, the time of economic news and the surprise factor.\n#\n# In the first case, we know nothing about the news. Our model sees only the earlier price movement, and one step before the economic news the model will be blind to the possible up or down jump caused by the surprise. This model doesn't know that the next step can have large up or down jumpy. This model will probably expect some more symmetric normal-like outcome even if capable to forecast a multimodal distribution.\n# In the second case, our model knows the time of the news, but not its surprise factor. A model trained on this dataset will probably know one step before the news that a big jump can come, but not the direction of the jump. This model will most likely forecast a bimodal distribution, probably with peaks of different heights based on our price and news time history.\n# In the third case, we know the time and the surprise of the news as well. Of course, this isn't possible before the time of the news. This knowledge will most probably reduce one peak of our bimodal distribution, as the model knows the historical effect of this side of the surprise, and most probably will forecast a more unimodal distribution.\n#\n# These conclusions are traceable by humans, but a very high dimension dataset can hide connections or patterns from us, but not necessarily from a neural network.\n#\n# Ok, let's go back to our basic example, to see how can we implement a model capable to forecast our peaks with Keras and Tensorflow Probability.\n\n#%%\n\nplot_multiple_forecasts(X_train, Y_train, Y_pred=None,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Forecasting with simple regression\n\n# To demonstrate the inability of the most common regression models to recognize bimodal patterns I build a simple LSTM model. The model complexity here doesn't matter. With a better model we can be able to predict more accurately the mean of the possible future paths, but not more. The problem is that in some datasets there is a chance, that the mean path will never happen. 
Unfortunately with non-probabilistic approaches, we can't do better, and even Dropout Monte Carlo will fail.\n\n#%%\n\ndef simple_conv_and_lstm_layout(window_size, targetsize):\n \"\"\"\n Define a simple LSTM layout\n \"\"\"\n keras.backend.clear_session()\n np.random.seed(42)\n tf.random.set_seed(42)\n\n model = keras.models.Sequential([\n keras.layers.LSTM(20, return_sequences=True,\n input_shape=[window_size, 1]),\n # for a simpler Y_valid tensor we don't use return_sequences=True in this notebook\n keras.layers.LSTM(20, return_sequences=False),\n keras.layers.Dense(targetsize)\n ])\n return model\n\n#%%\n\nmodel_lstm_ts10 = simple_conv_and_lstm_layout(window_size=30, targetsize=10)\nkeras.utils.plot_model(\n model_lstm_ts10, \"model_lstm_ts10.png\", show_shapes=True)\n\n#%%\n\n# this pattern is very easy, 3 epochs is enough\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_lstm_ts10.compile(loss=\"mse\", optimizer=optimizer)\nhistory = model_lstm_ts10.fit(X_train, Y_train, epochs=3,\n validation_data=(X_valid, Y_valid))\n\n#%%\n\nmodel_lstm_ts10.evaluate(X_test, Y_test)\n\n\n# In the graph below we can see that the model did a pretty good job if our only concern is the mean squared error and we are satisfied with the estimation of the mean of the possible paths. The real paths are denoted by \"x\", and the forecast paths by the \"+\" sign. 65% of our real paths go up, 35% go down. The forecast is an up-trend between the two. This isn't a bad forecast, depending on the problem this can be the estimate what we want.\n#\n# But if the data consists of the GPS coordinates of drones that reached our destination, and we want to send the next drone on the best possible path, then we definitely should avoid these kinds of predictions, as we can easily hit the tree between the roads. Maybe this isn't the best example, but it is obvious that in some cases the mean can be a very improbable point, and we don't want very improbable points to be our forecast.\n\n#%%\n\nY_pred = model_lstm_ts10.predict(X_test)\nplot_multiple_forecasts(X_train, Y_train, Y_pred,\n title=\"Simple Bimodal Series\")\nplt.show()\n\n\n# ## Fitting Unimodal Distribution to the data\n\n# Our artificial data have very similar distribution at every future step. The added noise has the same variance, only the means of the peaks are further from zero. I will examine the 6th step (index=5), the other steps have similar properties.\n#\n# First, we fit a normal distribution to the 6th forecast step. In the graph below we can see that this distribution how badly represents our data. As we fitted this distribution to the data itself, this is the best guess we can hope from a unimodal normal.\n\n#%%\n\n# build normal distribution fitted to the 6th forecast step\nnd_test = tfd.Normal(loc=np.mean(Y_test[:, 5]), scale=np.std(Y_test[:, 5]))\n\n#%%\n\n# check the statistics of the 6th forecast step\nprint(np.mean(Y_test[:, 5]))\nprint(np.std(Y_test[:, 5]))\n\n#%%\n\nlower = -0.75\nupper = 0.75\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, nd_test.prob(x).numpy(), lw=5, label=\"Fitted Gaussian Distribution\")\n_ = ax.legend()\n\n\n# ## Fitting Bimodal Distribution to the data\n\n# Instead of a unimodal Gaussian, we can try to fit a bimodal Gaussian. 
Since our artificial data is well separated, it isn't hard to build a distribution model close to the real one.\n\n#%%\n\n# separate the positive and negative branches\nY_test_neg_5 = Y_test[:, 5][Y_test[:, 5] < 0]\nY_test_pos_5 = Y_test[:, 5][Y_test[:, 5] >= 0]\n\n#%%\n\nprint(np.mean(Y_test_neg_5))\nprint(np.mean(Y_test_pos_5))\nprint(np.std(Y_test_neg_5))\nprint(np.std(Y_test_pos_5))\nprint(Y_test_neg_5.shape[0])\nprint(Y_test_pos_5.shape[0])\n\n\n# We estimate the weights of the distributions from the occurrence of the negative or positive paths and calculate the means and standard deviations of the positive and negative samples. With the MixtureSameFamily class, it is very easy to build the mixture distribution that well fits our data, and it would be awesome if we could forecast that distribution with a neural network.\n#\n# As you have foreseen we can do that :) These networks are called Mixture Density Networks, and here you can read an awesome article about the math behind them: [link](https://towardsdatascience.com/a-hitchhikers-guide-to-mixture-density-networks-76b435826cca)\n# (I borrowed the style of the histogram graphs as well, thanks [Olover Borchers](https://towardsdatascience.com/@oliverbor).)\n# In the article above you can check how to implement a mixture density layer yourself. Here I will use the [MixtureNormal layer](https://www.tensorflow.org/probability/api_docs/python/tfp/layers/MixtureNormal) from the Tensorflow Probability library.\n\n#%%\n\n# rebuild the distribution from the data\nweights = [Y_test_neg_5.shape[0], Y_test_pos_5.shape[0]]\nmeans = [np.mean(Y_test_neg_5), np.mean(Y_test_pos_5)]\nsigmas = [np.std(Y_test_neg_5), np.std(Y_test_pos_5)]\n\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=weights),\n components_distribution=tfd.Normal(\n loc=means,\n scale=sigmas))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\n\nax.set_xlabel(\"Y_test elemnt-index=5 distribution\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## The Mixture Density Network\n\n# This mixture density network will use the MixtureNormal layer, but the other parts of the network are very similar to the non-probabilistic network we used earlier. There are two main differences. Instead of the Dense layer, we use a MixtureNormal layer. The LSTM layer before the MixtureNormal layer needs to have the proper number of neurons to satisfy the needs of the MixtureNormal, and I set the activation to \"None\" because constraints of the default \"tanh\" are too restrictive to the MixtureNormal parameters.\n#\n# With real datasets, we don't know how many peaks our distributions can have, and the number of submodels can change depending on the input and the forecast step. Pretending that we don't know the number of peaks we set the number of component distributions to 3.\n#\n# The parameter size for the MixturNormal layer can be calculated easily. 
We have (3 components) * (10 steps) * (2 parameters of the Normal distributions) + 3 weight of the components = 63, but it is safer to calculate it in the following way.\n\n#%%\n\n# define the output distribution parameters\n# Number of components in the mixture (2 would be optional, but most of the time we don't know)\nnum_components = 3\nevent_shape = [10] # shape of the target (10 steps)\n# calculate the required input size for the mixture layer\nparams_size = tfp.layers.MixtureNormal.params_size(num_components, event_shape)\nprint(params_size)\n\n#%%\n\n# network layout with mixtureNormal layer\n\ninputs = keras.layers.Input(shape=(30, 1))\nh1 = keras.layers.LSTM(20, return_sequences=True)(inputs)\nh2 = keras.layers.LSTM(params_size, return_sequences=False,\n activation=None)(h1) # !!! params_size\ndl = tfp.layers.MixtureNormal(num_components, event_shape)(h2)\n\nmodel_mx = keras.models.Model(inputs=[inputs], outputs=[dl])\n\n\n# In this model, the weights of the components doesn't change from one forecast step to the other, but it is possible to make weights with more dimensions. With our 3 components and 10 forecast step we could have a (3, 10) shaped tensor as our component weight.\n\n#%%\n\nkeras.utils.plot_model(model_mx, \"model_mx.png\", show_shapes=True)\n\n\n# We can estimate how probable is our data given our distribution. Log probabilities are more practical for computations. Negative log probabilities give us the loss functions we want to minimize. This loss function is very simple to implement when the output of our model is a Tensorflow distribution object.\n\n#%%\n\n# loss function for distributions\ndef negloglik(y, rv_y): return -rv_y.log_prob(y)\n\n#%%\n\n# Let's Rock and Roll!\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\n\n\n# In our dataset every example is very similar to the other, the difference is only the noise, so we will examine only the first example from the test set.\n\n#%%\n\n# Our inputs are very similar, so we use only the first element to forecast our distribution\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\n\n\n# Our distribution consists of different submodules. The parameters of these submodules are our forecasted variables.\n\n#%%\n\n# the components of our mixture model\nyhat.submodules\n\n\n# One of our submodules describes the (3, 10) normal distributions we fitted to our data. We will check the 6th steps as we did earlier. We can see that the first two means are very close to our real component distribution means, and the third is close to zero.\n\n#%%\n\n# the Normal component mean for the 6th step\nyhat_means = yhat.submodules[2].mean().numpy()\nprint(yhat_means.shape)\nprint(yhat_means[:, :, 5])\n\n\n# The other submodule is the Categorical distribution submodel. This submodule contains the weights of the components [0.29453883 0.6899422 0.01551905]. The first two weights are close to our 35% and 65%, and the third is practically negligible. The model was able to recognize that we have only two real components.\n\n#%%\n\n# the categorical component weights, with our model this is the same for all step,\n# but could have the same dimension as the Normal mean\nyhat_cw = yhat.submodules[1].probs_parameter().numpy()\nprint(yhat_cw.shape)\nprint(yhat_cw)\n\n\n# In the graph below the line-widths are determined by the component weights. 
As we expected the upper trend is stronger, but the lower trend is apparent as well, the third component is almost invisible.\n\n#%%\n\n# the forecasted mean weighted by the weight of the corresponding distribution\nplt.plot(yhat_means[0, 0, :], linewidth=yhat_cw[0, 0]*10)\nplt.plot(yhat_means[0, 1, :], linewidth=yhat_cw[0, 1]*10)\nplt.plot(yhat_means[0, 2, :], linewidth=yhat_cw[0, 2]*10)\n\n\n# The components with larger weights have small standard deviations, but the third component has relatively large. Along with its small weight, this further confirms that our third component is redundant. If we face such a component we should consider dropping it, or retrain our model with fewer components.\n\n#%%\n\n# the standard deviations of the 6th step\nyhat_stddev = yhat.submodules[2].stddev().numpy()\nyhat_stddev[:, :, 5]\n\n\n# Next, we will rebuild the forecasted distribution of the 6th step and compare it to the real distribution of the test set. The forecasted distribution fits well the data. Tunning the model probably could result in an even better fit.\n\n#%%\n\n# rebuild the distribution of the 6th step from the forecasted data\ngm_test = tfd.MixtureSameFamily(\n mixture_distribution=tfd.Categorical(probs=yhat_cw[0]),\n components_distribution=tfd.Normal(\n loc=yhat_means[0, :, 5],\n scale=yhat_stddev[0, :, 5]))\n\n#%%\n\nf = plt.figure(figsize=(10, 4))\nax = plt.gca()\nsns.distplot(Y_test[:, 5], bins=50, kde=False,\n norm_hist=True, ax=ax, label=\"Histogram\")\nax.set_xlim(lower, upper)\n\nx = np.linspace(upper, lower, int(1e4), dtype=np.float32)\nax.plot(x, gm_test.prob(x).numpy(), lw=5, label=\"Gaussian Mixture\")\n_ = ax.legend()\n\n\n# ## Probabilistic forecast visualization\n\n# With non-probabilistic neural networks, we get only one number for a variable. With probabilistic models we can get as many random forecast scenarios as we want, we can examine the mean of the distribution which is comparable to the non-probabilistic result, and we can examine the submodule means of a multinomial case. This can be seen in the figure below. We didn't drop our underweighted submodule, and because of that, we got some very random forecast paths.\n\n#%%", "original_comment": "# sample from the forecasted distribution\n", "target_code": "smpl = yhat.sample(100).numpy()\n", "project_metadata": {"full_name": "sinusgamma/multimodal_network", "description": "Mixture Density Network with Tensorflow Probability. 
Demonstrate the usefulness of multi-modal distribution outputs for neural networks.", "topics": [], "git_url": "git://github.com/sinusgamma/multimodal_network.git", "stars": 11, "watchers": 11, "forks": 0, "created": "2020-03-08T10:08:43Z", "size": 3194, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1217660}, "last_updated": "2021-01-04T15:29:04Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "def negloglik(y, rv_y): return -rv_y.log_prob(y)\noptimizer = keras.optimizers.Adam(lr=0.001)\nmodel_mx.compile(loss=negloglik, optimizer=optimizer)\nhistory = model_mx.fit(X_train, Y_train, epochs=20,\n validation_data=(X_valid, Y_valid))\nyhat = model_mx(X_test[:1, :, :])\nassert isinstance(yhat, tfd.Distribution)\nyhat.submodules\n", "model": "no-comments", "intent": "# sample from the forecasted distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. 
Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Code examples for storing notebooks and data in a git repository\n\n#%%\n\nimport os\nimport getpass\nimport subprocess\n\nfrom __future__ import print_function\n\n\n# ### Repository information and account credentials\n\n#%%\n\n# Github repository\ngithub_repo = \"github.com/hluetck/jupyter_notebooks.git\"\n# Local path for the repository\nrepo_path = \"/Users/Henry/Data/temp/test/jupyter_notebooks\"\n# Github account credentials\ngithub_user = 'hluetck'\ngithub_password = getpass.getpass()\n\n\n# ### Clone an exisiting repository\n\n#%%\n\n# check if repo_path exists already\nif os.path.isdir(repo_path):\n print(\"Directory %s exists. Please delete it before cloning!\" % (repo_path))\n# build command for cloning\nclone_command = \"git clone https://\" + github_user + \":\" + \\\n github_password + \"@\" + github_repo + \" \" + repo_path\n\n#%%\n\n# clone repository\nreturn_code = subprocess.call([clone_command], shell=True)\nif not return_code:\n print(\"Successfully cloned into %s\" % (repo_path))\nelse:\n print(\"Cloning failed. Maybe check user name and password?\")\n\n\n# ### Commit changes and push to Github\n\n#%%\n\n# get current working directory\ncwd = os.getcwd()\n# change to the repository folder\nos.chdir(repo_path)", "original_comment": "# print git status for the local repository\n", "target_code": "status_command = \"git status\"\noutput = subprocess.check_output(status_command, shell=True)\nprint(output)\n", "project_metadata": {"full_name": "uzh/helmchen-spark", "description": "Playbooks and other files to build a (virtual) Spark cluster for Prof. 
Helmchen's research group", "topics": [], "git_url": "git://github.com/uzh/helmchen-spark.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2016-03-23T21:54:52Z", "size": 6519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2819538, "Python": 37375, "Shell": 3482}, "last_updated": "2019-12-15T16:09:17Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "get_ipython().run_cell_magic('bash', '-s \"$repo_path\" \"$github_user\" \"$github_password\"',\n '\\ncd \"$1\"\\n\\nls')\n", "model": "no-comments", "intent": "# print git status for the local repository"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n'''\nLSTM playground\n'''\n\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n 
(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n\nbatchedInputMatrix.shape\n\n\nbatchedOutputMatrix.shape\n\n\nbatchedInputMatrix[0, 0]\n\n\nbatchedOutputMatrix[0, 0]\n\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n#%%\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n#%%\n\n'''\nLSTM playground\n'''\n\n#%%\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n#%%\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n#%%\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n#%%\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n 
int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n#%%\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n#%%\n\nbatchedInputMatrix.shape\n\n#%%\n\nbatchedOutputMatrix.shape\n\n#%%\n\nbatchedInputMatrix[0, 0]\n\n#%%\n\nbatchedOutputMatrix[0, 0]\n\n#%%\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n#%%\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n#%%\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n#%%\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))", "original_comment": "# output layer\n", "target_code": "simpleStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n", "project_metadata": {"full_name": "miroenev/teach_DL", "description": null, "topics": [], "git_url": "git://github.com/miroenev/teach_DL.git", "stars": 36, "watchers": 36, "forks": 15, "created": "2017-07-19T18:01:29Z", "size": 98182, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12259106, "Python": 43930, "Dockerfile": 2478, "Shell": 1713}, "last_updated": "2020-09-04T16:13:54Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "precision": "Agree", "precision-score": 2, "coverage": "Disagree", "coverage-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "simpleNonStatefulModel.add(\n LSTM(10, batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n", "model": "docstring", "intent": "# add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv 
venv\n\n# ### Imports\n\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)\nplt.legend()\nplt.savefig('lumpsum.png')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# - Erics-MBP-3:Desktop Eric\\$ mkdir dca-ls-analysis\n# - Erics-MBP-3:Desktop Eric\\$ cd dca-ls-analysis\n# - Erics-MBP-3:dca-ls-analysis Eric$ virtualenv venv\n\n# ### Imports\n\n#%%\n\nimport pandas as pd\nimport pandas_datareader.data as web\nimport datetime\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport matplotlib.ticker as ticker\nimport numpy as np\n\n\n# ### Pull Data\n# Note: I set up a venv to revert back to Python 2 since DataReader does not work on Python 3\n\n#%%\n\n# Pull data for spy, adj is split & dividend adjusted\nstart_date = datetime.datetime(1995, 6, 19)\nend_date = datetime.datetime(2020, 6, 
19)\n#end_date = datetime.datetime(2016, 1, 9)\n\nspy_data = web.DataReader(\"SPY\", \"yahoo\", start_date, end_date)\n\nprint(spy_data.tail()) # See first few rows\n\n\n# ### Define Functions\n\n#%%\n\ndef lumpsum(invest_date, principal=10000):\n invest_price = spy_data.loc[invest_date]['Adj Close']\n current_price = spy_data['Adj Close'][-1]\n\n investment_return = (current_price / invest_price) - 1\n\n return principal*(1+investment_return)\n\n#%%\n\ndef dollar_cost_average(invest_date, periods=12, freq='30D', principal=10000):\n\n # Get DCA dates\n dca_dates = investment_dates_all = pd.date_range(\n invest_date, periods=periods, freq=freq)\n\n # Filter out ones past the last data day\n dca_dates = dca_dates[dca_dates < spy_data.index[-1]]\n\n # Figure out how many dates we cut off\n cut_off_count = 12 - len(dca_dates)\n\n # Amount you have in cash and not the market\n value = cut_off_count*(principal/periods)\n\n for date in dca_dates:\n # Get an actual trading day\n trading_date = spy_data.index[spy_data.index.searchsorted(date)]\n\n # Calculate lumpsum value if invested on that date, add to value\n value += lumpsum(trading_date, principal=principal/periods)\n\n return value\n\n\n# ### Analysis\n\n#%%\n\n# Plot SPY\nspy_price = spy_data['Adj Close']\n\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, spy_price, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Adjusted SPY Price', size=18)\nax.set_ylabel('Price ($)', size=14)\nax.set_xlabel('Date', size=14)\n\n\nplt.legend()\nplt.savefig('spy_chart.png')\nplt.show()\n\n#%%\n\n# Lump Sum\n\n# Simulate Lump Sum Investing\nlump_sum = [lumpsum(x) for x in spy_data.index]\n\n# Format and plots\nsns.set_style(\"whitegrid\")\nplt.figure(figsize=(15, 6))\nfig, ax = plt.subplots()\n\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\n\n# Labels\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size=14)", "original_comment": "# Show with Legend\n", "target_code": "plt.show()\n", "project_metadata": {"full_name": "eonofrey/DollarCostAverage_vs._LumpSum", "description": "Comparing dollar cost averaging vs. 
lump sum investment in the SPY ", "topics": [], "git_url": "git://github.com/eonofrey/DollarCostAverage_vs._LumpSum.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-06-19T21:58:51Z", "size": 1525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 587938}, "last_updated": "2020-12-19T01:53:56Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig, ax = plt.subplots()\n# Style and size\nsns.set_style(\"whitegrid\")\nfig.set_size_inches(15, 7)\n# Plot Series\nax.plot(spy_data.index, lump_sum, color='black')\n# Set Y axis format\ntick = ticker.StrMethodFormatter('${x:,.0f}')\nax.yaxis.set_major_formatter(tick)\nax.set_title('Lump Sum Value Today', size=18)\nax.set_ylabel('Current Value ($)', size=14)\nax.set_xlabel('Date of Investment', size\n", "model": "natural", "intent": "# Show with Legend"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? 
(Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. control for t1, t2, and t3.\n#\n\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of 
europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Nonparametric statistical significance tests\n#\n# non_parametric_tests_assign_solution.ipynb\n#\n# Complete the assignment below.\n#\n# References:\n# - Nonparametric Statistics for Non-Statisticians: A Step-by-Step Approach, 2009.\n# - How to Calculate Nonparametric Statistical Hypothesis Tests in Python, Jason Brownlee, 2018.\n#\n\n# ## Assignment\n#\n# ### Exercise 1\n#\n# Our first dataset includes a very famous dataset, the Winsconsin Breast cancer dataset which is also available in https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic). It includes features for cell nuclei for two categories of tumors: malignant and benign. As explained in the original data source:\n#\n# >Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.\n# n the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: \"Robust Linear Programming Discrimination of Two Linearly Inseparable Sets\", Optimization Methods and Software 1, 1992, 23-34].\n#\n# Dataset:\n# - https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\n#\n# We will be using three of these features for our exercise:\n# - diagnosis: the classification of the tumor with values \"M\" (malignant) and \"B\" (benign).\n# - area_mean: the average area covered by the tumor cells\n# - concavity_mean: severity of concave portions of the contour\n# - texture_mean: standard deviation of gray-scale values\n#\n# For the three features described above(area_mean, concavity_mean, texture_mean), is there a difference in their average values in the two diagnosis groups? (Do samples of these features originate from the same distribution?)\n# - Select and apply the appropriate statistical test.\n# - Provide brief rationale for your selection of statistical test.\n# - Generate boxplots to compare experimental vs. 
control for t1, t2, and t3.\n#\n\n#%%\n\nimport seaborn as sns\nimport scipy.stats as stats\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')\n\ndf = pd.read_csv(\n \"https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/breast_cancer.csv\")\ndf = df[[\"diagnosis\", 'area_mean', 'concavity_mean', 'texture_mean']]\ndf.head()\n\n#%%\n\n# Check the distribution of classes for the the diagnosis column\nm = df[(df['diagnosis'] == 'M')]\nb = df[(df['diagnosis'] == 'B')]\n\nprint(stats.shapiro(m['area_mean']))\nprint(stats.shapiro(m['concavity_mean']))\nprint(stats.shapiro(m['texture_mean']))\nprint(stats.shapiro(b['area_mean']))\nprint(stats.shapiro(b['concavity_mean']))\nprint(stats.shapiro(b['texture_mean']))\n\n#%%\n\ndf.info()\n\n\n# We explore if the referenced features follow a normal distribution. We examine the skewness and kurtosis of the groups.\n#\n# Optional: We can run a Jarque\u2013Bera test which tests if the data have the skewness and kurtosis matching a normal distribution.\n# In the case of the Jarque\u2013Bera test the null hypothesis would state that the skewness and kurtosis matches that of a normal distribution.\n\n#%%\n\n# Print the skewness and kurtosis of the groups. Optionally run a Jarque\u2013Bera\n# Afterwards you can comment your findings wether the features follow a normal distribution of not.\nprint(stats.skew(m['area_mean']))\nprint(stats.skew(m['concavity_mean']))\nprint(stats.skew(m['texture_mean']))\n\nprint('\\n')\n\nprint(stats.kurtosis(m['area_mean']))\nprint(stats.kurtosis(m['concavity_mean']))\nprint(stats.kurtosis(m['texture_mean']))\n\n#%%\n\nmalignant = df[df.diagnosis == 'M'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\nbenign = df[df.diagnosis == 'B'][[\n 'area_mean', 'concavity_mean', 'texture_mean']]\n\n#%%\n\n# Plot the histograms for the distribution of each feature and for each class (malignant/benign) to visually explore\n# wether these distributions are skewed or not.\n\nplt.figure()\nplt.hist(m['area_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['concavity_mean'], alpha=0.5)\nplt.figure()\nplt.hist(m['texture_mean'], alpha=0.5)\nplt.show\n\n#%%\n\n# Run 3 Kruskal-Wallis tests, one for each group/feature and describe the results of these tests.\n# State wether the null hypothesis will be rejected or not.\nprint(stats.kruskal(m['area_mean'], b['area_mean']))\nprint(stats.kruskal(m['concavity_mean'], b['concavity_mean']))\nprint(stats.kruskal(m['texture_mean'], b['texture_mean']))\n\n#%%\n\n# Afterwards create 3 boxplots: one for each feature grouped by the diagnosis class.\nsns.boxplot(x='diagnosis', y='area_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='concavity_mean', data=df)\nplt.show()\nsns.boxplot(x='diagnosis', y='texture_mean', data=df)\nplt.show()\n\n\n# ### Exercise 2\n#\n# For the following test we will use the Real GDP per capita dataset for Europe available by Eurostat from the following address https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n# It contains the GDP per capita for each country (The indicator is calculated as the ratio of real GDP to the average population of a specific year.)\n# We will use the years 2007, 2008 and 2009 to check if the economic crisis of 2008 affected the GDP values of europe.\n#\n# Dataset:\n# - https://ec.europa.eu/eurostat/web/products-datasets/-/sdg_08_10\n#\n#\n# Questions:\n#\n# Is there a difference in the GDP per capita between the years (2007 and 2008), (2008 and 
2009)?\n# - Select and apply the appropriate statistical test.\n#\n\n# Our dataset needs some cleaning at first. We extract only the columns of interest, clean numeric columns of unwanted characters and convert these columns to numeric.\n# We also remove rows that refer to collective index for many countries (\"EA19\", \"EU27\",\"EU28\") and keep individual countries.\n\n#%%\n\ngdp = pd.read_csv(\"https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/sdg_08_10.tsv.gz\",\n compression='gzip', sep=r'\\,|\\t', usecols=[\"unit\", \"geo\\\\time\", \"2007 \", \"2008 \", \"2009 \"], engine='python')\ngdp = gdp[gdp.unit == \"CLV10_EUR_HAB\"].drop([\"unit\"], axis=1)\ngdp.columns = [\"country\", \"2007\", \"2008\", \"2009\"]\ngdp = gdp[~gdp.country.isin([\"EA19\", \"EU27\", \"EU28\"])]\ngdp['2007'] = gdp['2007'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2008'] = gdp['2008'].str.extract('(\\d+)', expand=False).astype(int)\ngdp['2009'] = gdp['2009'].str.extract('(\\d+)', expand=False).astype(int)\ngdp.head()\n\n#%%\n\ngdp.describe()\n\n\n# Data that refer to economic indices usually do not follow a normal distribution, something we can easily observe by visualizing the data below, hence we should use non parametric tests.\n\n#%%", "original_comment": "# Plot the histogram for the values of each year.\n", "target_code": "plt.figure()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "sns.histplot(x=\"2007\", data=gdp)\nplt.show()\n", "model": "docstring", "intent": "# Plot the histogram for the values of each year."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n\nx = 
low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport scipy.stats\n\n#%%\n\ndf3 = pd.read_csv('income_cases_2.csv')\n\n\n# ### The following code displays the merged dataset datatypes.\n\n#%%\n\ndf3.dtypes\n\n\n# ### The following code displays statistics from the merged dataset.\n\n#%%\n\ndf3.describe()\n\n\n# ### The code in the following cells normalizes the data by population and displays the result.\n\n#%%\n\n#df3[[\"deaths\",\"Total_Population\"]] = scaler.fit_transform(df3[[\"deaths\",\"Total_Population\"]])\ndeath_norm_list = df3[\"deaths\"]/df3[\"Total_Population\"]\ndf3.describe()\n\n#%%\n\nlow_income_norm_list = df3[\"HH_income_less_35k\"]/df3[\"Total_Population\"]\nprint(low_income_norm_list)\n\n\n# ### The following code assigns the x-value for the plots.\n\n#%%\n\nx = low_income_norm_list\n\n\n# ### The following code assigns the y-value for the plots.\n\n#%%\n\ny = death_norm_list\n\n\n# ### The following code forces the two lists into arrays.\n\n#%%\n\nd_norm = np.array(death_norm_list)\nli_norm = np.array(low_income_norm_list)", "original_comment": "# ### The following code cells shape the new arrays.\n", "target_code": "np.shape(d_norm)\n", "project_metadata": {"full_name": "abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation", "description": "initial commit", "topics": [], "git_url": "git://github.com/abdulmohammed3/Covid-19_Disease_Transmission_and_Economic_Correlation.git", "stars": 4, "watchers": 4, "forks": 0, "created": "2020-11-06T17:59:31Z", "size": 13456, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5835338, "Python": 6376}, "last_updated": "2020-11-12T20:56:51Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Strongly agree", "precision-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(d_norm.shape)\nprint(li_norm.shape)\n", "model": "natural", "intent": "# shape array"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: 
{}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Take a Random Sample of a Pandas Dataframe\n\n# ## Imports\n\n#%%\n\nimport pandas as pd\nimport numpy as np\n\n\n# ## Import the Data\n\n#%%\n\n# Location of the data file\ndata_file = \"/home/students/data/ontime/flights.csv\"\n\n#%%\n\n# Get the data\nflights_df = pd.read_csv(data_file)\n\n#%%\n\n# View the top five records\nflights_df.head(5)\n\n\n# ## Process the Data\n\n#%%\n\n# Determine the number of records in the sample\nnum_records = len(flights_df)\nsample_percentage = 0.2\nnum_records_in_sample = int(num_records * sample_percentage)\n\nprint(\"Total records: {}\".format(num_records))\nprint(\"Sample percentage: {}%\".format(sample_percentage * 100))\nprint(\"Records in sample: {}\".format(num_records_in_sample))\n\n#%%\n\n# Create a sample from the dataframe\nsample_df = flights_df.sample(num_records_in_sample)\n\n#%%", "original_comment": "# Show the top 10 rows of the sample\n", "target_code": "sample_df.head()\n", "project_metadata": {"full_name": "rdempsey/data-analytics-machine-learning-big-data", "description": "Slides, code and more for my class: Data Analytics and Machine Learning on Big Data", "topics": ["big-data", "machine-learning", "jupyter-notebook", "graphviz", "data-exploration", "python", "pyspark", "mllib"], "git_url": "git://github.com/rdempsey/data-analytics-machine-learning-big-data.git", "stars": 6, "watchers": 6, "forks": 18, "created": "2017-11-13T17:50:29Z", "size": 132919, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 465300, "Shell": 4686, "Python": 692}, "last_updated": "2020-10-12T03:12:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "flights_df.head(10)\n", "model": "docstring", "intent": "# Show the top 10 rows of the sample"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. 
The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n\ntype(my_decimal)\n\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. 
For example:\n\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # August 21\n\n# Today, we will be orienting ourselves to the Jupyter Notebook, learning about Python, and looking at our first data sets.\n\n# ## The Jupyter Notebook\n#\n# The interface in front of you is called a Jupyter Notebook. This cell that you're reading now is a _Markdown cell_. These are used to hold _text_ information. I will use them to communicate with you throughout the course. We can even embed images in the Markdown. If you double-click in this cell, you will see that this is plain text. The plain text characters are transformed into attractive text by the Jupyter Notebook.\n#\n\n#%%\n\n# This is a code cell.\n\nimport pandas as pd\nimport random\n\n# It holds code.\n\nlist = random.randint(1, 10)\n\n# It can also hold text, in the form of comments. Comments are helpful hints to yourself. Jupyter will not execute\n# the comments. Leave as many as you want!\n\nprint(list)\n\n\n# When a cell is \"run\", by pressing the run button, it will be executed in a manner that is appropriate for the type of cell it is. If it is a code cell, the code will be run. If it is a markdown cell, the text will be rendered.\n\n# # What is Python?\n\n# Python is an open-source, free to use programming language. *Open Source* is desireable because we can look at any of the given functions in Python, and understand how they work. *Free*, we all understand why that is good.\n#\n# Python is actively maintained by the Python Software Foundation, and is rapidly becoming one of the world's most commonly-used languages.\n#\n# ![Python Popularity](img/pythondominance.png)\n#\n# You find Python in virtually all fields, and all career paths.\n#\n# Python is also easy to read. 
Without knowing any Python, look at the below cell. See if you can figure out what it will do, then run it to see if you're right.\n\n#%%\n\nnum_list = [1, 2, 3, 4]\nnew_list = []\n\nfor entry in num_list:\n new_list.append(entry*2)\n\nprint(new_list)\n\n\n# Were you able to guess correctly? Python is written in such a way that it mimics human speech and writing.\n#\n# Python also has an active user community who communicate different packages and workflows to the software. For example, I use the Python library DendroPy almost daily in my work. It is for working with phylogenetic data in Python.\n\n# # Ask for help when you need it\n# # I'm not joking around\n# # This class is a little different than others, in that we don't have many throwaway moments when you learn a fact, use it on a test, then maybe never use it again\n# # If you don't get it now, it might be a problem later, and we'll work on it. Now.\n#\n# Seriously, y'all, just ask. We'll get it worked out.\n\n# # Data types in Python\n#\n# In the first couple weeks, we will be working with datasets in an interactive way. But first, we should learn a little bit about how Python works. One of the most common operations to do programmatically is save data to a variable. A _variable_ is a little bit of space we clear in the computer's memory. We can fill it with information, and give it a handle to recall it later. See below.\n\n#%%\n\nmy_text = \"This is a string variable\"\n# Strings are varaibles that are meant to be read literally as they are seen above. Often, they are text.\n# You know a string because it will be encased in quotation marks\n# Enter the name of the variable to view it.\nmy_text\n\n#%%\n\nmy_number = 64\n# This is an integer value\nmy_decimal = 1.64\n# This number has a decimal\n\n\n# The kind of variable you create dictates some of the things you can do with it. Do you think my_number and my_integer are the same kind of variable? Run the below code to find out.\n\n#%%\n\ntype(my_decimal)\n\n#%%\n\ntype(my_number)\n\n\n# \"Float\" - what does that mean? Floats are stored differently in the computer's memory than integers are, and saving whole numbers as integers can mean programs take less memory to run.\n#\n# The \"type\" refers to the kind of variable something is. This can influence what operations you can do with that variable. For example:\n\n#%%\n\nround(my_decimal)\n\n\n# What does round() do? What does it do if you call it on `my_number`?\n#\n# `round()` is a function. So is `type()`. We can think of functions like organs in our body - they are sets of code that work together to accomplish some task. You can recognize that you are calling a function by the presence of the open and close parentheses. Functions have help available via the help function.\n\n#%%\n\nhelp(round)\n\n\n# There are more data types out there, but we'll start with these. Most of the data we will work with in this course will be of these three types - integers, floats and strings.\n\n# # Operators in Python\n#\n# Python uses what are likely to be familiar operators: `+, -, /, *, %`\n#\n# Try using each of these operators to combine `my_number` and `my_decimal`. What behaviors make sense? Which are hard to understand? To try using operators, first make a new code cell, by clicking the `+` button above. Then, enter the comparison you would like to make.\n#\n#\n\n# We can also use what are called logical operators. These operators, `<, >, ==, !=, <=, >=` evaluate objects relative to one another. 
Once again, create a new cell and try each operator to compare `my_number` and `my_decimal`. What is each one doing?\n\n# # Groups of Objects\n#\n# ## Lists\n#\n# How often do you want to sit down and hand-enter data? Basically never. For the purpose of storing more massive sets of objects, we have lists. Lists are _ordered_, meaning that they are stored in the same order in the computer's memory as when you enter them.\n\n#%%\n\nmy_number_list = [1, 2, 3, 4, 5]\n\nmy_number_list[2]\n\n\n# Did you note something odd, there? What happens if you try to access the first element of the `my_number_list`?\n#", "original_comment": "# Lists can also be added to:\n", "target_code": "my_number_list.append(6)\n", "project_metadata": {"full_name": "wrightaprilm/CompBio2018", "description": null, "topics": [], "git_url": "git://github.com/wrightaprilm/CompBio2018.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2018-08-14T16:08:48Z", "size": 8976, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4697133, "TeX": 4884, "Python": 4399}, "last_updated": "2019-06-27T20:53:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "my_number_list.append(6)\nmy_number_list\n", "model": "docstring", "intent": "# add element to list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n\n# One-time 
calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 
'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in 
km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n 
# Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n\n# Filter out data before dt_cutoff_min. 
Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport pytz\nfrom datetime import datetime, timedelta\nfrom sklearn.preprocessing import StandardScaler\nfrom shapely import wkb, wkt\nimport geopandas\nimport pandas\nimport numpy\nimport covid19_userLocal as covid19\nfrom ibmpairs import paw\n\nimport os\nimport sys\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\n# IBM PAIRS open-source module\n\n#%%\n\n# For Both Cases and Growth we use the same ROLLING_WINDOW\nROLLING_WINDOW = 14\nlag_growthCases = 19\nlag_growthCasesStd = 4\n\nlag_Feature = numpy.arange(\n lag_growthCases-lag_growthCasesStd, lag_growthCases+lag_growthCasesStd+1, 1)\nprint('lag_Feature', lag_Feature)\n\n#dt_cutoff_training_COVID = datetime(2020,5,31, tzinfo=pytz.utc)\ndt_cutoff_training_COVID = datetime(2020, 7, 25, tzinfo=pytz.utc)\nprint('Training only with COVID growth data until ', dt_cutoff_training_COVID)\ndt_cutoff_training_mobility = dt_cutoff_training_COVID - \\\n timedelta(days=lag_growthCases - lag_growthCasesStd)\nprint('Training only with Mobility data until ', dt_cutoff_training_mobility)\ndt_cutoff_min = datetime(2020, 3, 1, tzinfo=pytz.utc)\nprint('Considering data from ', dt_cutoff_min)\ndt_cutoff_latest = datetime(2020, 8, 2, tzinfo=pytz.utc)\nprint('Plotting data up to ', dt_cutoff_latest)\n\ndata_subdirectory = 'data/csv/run98FullTrainingJHU'\nif not os.path.exists(data_subdirectory):\n os.makedirs(data_subdirectory)\nprint('data_subdirectory ', data_subdirectory)\n\n#%%\n\n# One-time calculation\n\"\"\"\n# Local Polygons\ndf_region = pandas.read_csv('data/local_polygons.csv', usecols=['id', 'name', 'poly'])\n\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = df_region.rename(columns={'id': 'pairs_id'})\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\n# We need County and State columns later on\nnew = df_region['name'].str.split('.', expand=True)\ndf_region['County'] = new[1]\ndf_region['State'] = new[0]\n\n# Write it out without the wkb.loads\ndf_region_csv = df_region.copy()\ndel df_region_csv['poly']\ndf_tmp = pandas.read_csv('data/local_polygons.csv')[['id', 'poly']]\ndf_tmp = df_tmp.rename(columns={'id': 
'pairs_id'})\ndf_region_csv = pandas.merge(df_region_csv, df_tmp, on='pairs_id', how='left')\ndf_region_csv.to_csv('data/df_region.csv', index=None)\n\ndf_region.tail()\n\"\"\"\n\n#%%\n\n# Get the region data (county ids, names, and polygons)\ndf_region = pandas.read_csv('data/df_region.csv')\n# Make a copy in the specific subfolder\ndf_region.to_csv(os.path.join(data_subdirectory, 'df_region.csv'), index=None)\n# Read back\ndf_region = pandas.read_csv(os.path.join(data_subdirectory, 'df_region.csv'))\ndf_region['poly'] = df_region['poly'].apply(lambda x: wkb.loads(x, hex=True))\ndf_region = geopandas.GeoDataFrame(df_region, geometry='poly')\n\ndf_region.tail()\n\n#%%\n\n# Query Local COVID-19 Cases\ncoronaQueryLocal = covid19.query_local(layerID='P567C6007') # JHU\ndf_local_covid = coronaQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_covid = df_local_covid.rename(columns={'Value': 'Cases'})\ndf_local_covid['pairs_id'] = df_local_covid['pairs_id'].astype(int)\ndf_local_covid = df_local_covid[df_local_covid['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\ndf_local_covid = pandas.merge(\n df_local_covid, df_region[['pairs_id']], on='pairs_id').reset_index()\n\n# Make a copy in the specific subfolder\ndf_local_covid.to_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'), index=None)\n\ndf_local_covid.tail()\n\n#%%\n\n# Read back covid data from csv (raw cumulative cases)\ndf_local_covid = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_covid.csv'))\ndf_local_covid['timestamp'] = pandas.to_datetime(df_local_covid['timestamp'])\ndf_local_covid.tail()\n\n#%%\n\n# Query Local Mobility (Descartes lab median of max mobility)\nmobilityQueryLocal = covid19.query_local(layerID='P612C6303')\ndf_local_mobility = mobilityQueryLocal.vdf[[\n 'timestamp', 'pairs_id', 'State', 'County', 'Value']]\ndf_local_mobility = df_local_mobility.rename(columns={'Value': 'Mobility'})\ndf_local_mobility['pairs_id'] = df_local_mobility['pairs_id'].astype(int)\ndf_local_mobility = df_local_mobility[df_local_mobility['timestamp']\n <= dt_cutoff_latest].reset_index(drop=True)\n\n# Make a copy in the specific subfolder\ndf_local_mobility.to_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'), index=None)\n\ndf_local_mobility.tail()\n\n#%%\n\n# Read back mobility data (Descartes lab median of max mobility)\ndf_local_mobility = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_local_mobility.csv'))\ndf_local_mobility['timestamp'] = pandas.to_datetime(\n df_local_mobility['timestamp'])\ndf_local_mobility.tail()\n\n#%%\n\n# Unstacking COVID19 and mobility\n\n# Unstack the COVID19 data and first derivative (new cases)\ndf_unstacked = df_local_covid.copy()\ndel df_unstacked['State']\ndel df_unstacked['County']\n\ndf_unstacked = df_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_unstacked = df_unstacked.swaplevel(axis=1)\ndf_unstacked = df_unstacked.replace(0, numpy.nan)\n\n# Replace values where no change with nan so that the daily numbers make sense when reporting only every couple of days\n# (also replace values with negative change)\ndf_unstacked[df_unstacked.diff() <= 0] = numpy.nan\n\n# Interpolate\ndf_unstacked = df_unstacked.interpolate(method='linear', limit_area='inside')\n\n# New Local Cases (1st derivative)\ndf_new = df_unstacked.diff()\n\n\n# Unstack the mobility data\ndf_m_unstacked = df_local_mobility.copy()\ndel df_m_unstacked['State']\ndel 
df_m_unstacked['County']\n\ndf_m_unstacked = df_m_unstacked.set_index(['timestamp', 'pairs_id']).unstack(\n).reset_index().sort_values(by='timestamp').set_index('timestamp')\ndf_m_unstacked = df_m_unstacked.swaplevel(axis=1)\n\n# Erase high-value mobility outliers >100miles before taking the rolling mean\ndf_m_unstacked = df_m_unstacked.clip(upper=100)\n\n# Interpolate\ndf_m_unstacked = df_m_unstacked.interpolate(\n method='linear', limit_area='inside')\n\ndf_m_unstacked.tail()\n\n\n# Stack and merge in order to fill in nan at all missing combinations\ndf_stacked = pandas.merge(df_new.stack(level='pairs_id').reset_index(),\n df_m_unstacked.stack(level='pairs_id').reset_index(),\n on=['timestamp', 'pairs_id'],\n how='outer'\n )\ndf_stacked['pairs_id'] = df_stacked['pairs_id'].astype(int)\n\n# Unstack again\ndf_stacked = df_stacked.set_index(['timestamp', 'pairs_id']).unstack().reset_index().sort_values(\n by='timestamp').set_index('timestamp')\n\ndf_new = df_stacked[['Cases']].swaplevel(axis=1)\ndf_m_unstacked = df_stacked[['Mobility']].swaplevel(axis=1)\n\ndf_new.tail()\n\n#%%\n\n# FIPS codes to pairs_id\ndf_fips = pandas.read_csv('data/County_PAIRS_FIPS.csv',\n dtype={'FIPS': 'string'})\n\n# Make a copy in the specific subfolder\ndf_fips.to_csv(os.path.join(data_subdirectory,\n 'County_PAIRS_FIPS.csv'), index=None)\n\n# Read back translation FIPS codes to pairs_id\ndf_fips = pandas.read_csv(os.path.join(\n data_subdirectory, 'County_PAIRS_FIPS.csv'), dtype={'FIPS': 'string'})\ndf_fips.tail()\n\n#%%\n\n# Census data\ndf_census = pandas.read_csv('data/cc-est2019-alldata.csv', dtype={'STATE': 'string',\n 'COUNTY': 'string'})\ndf_census['FIPS'] = df_census['STATE'] + df_census['COUNTY']\n\n# Use only latest (2019 estimate)\ndf_census = df_census[df_census['YEAR'] == 12]\ndf_census.tail()\n\n#%%\n\n# Absolute population numbers\n\ndf_population = df_census[df_census['AGEGRP'] == 0][[\n 'FIPS', 'TOT_POP']].reset_index(drop=True) # Total population\ndf_population = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_population, on='FIPS').drop(columns='FIPS')\ndf_population = df_population.rename(columns={'TOT_POP': 'population'})\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby sum pairs_id\ndf_population = df_population.groupby(\n 'pairs_id').sum().reset_index().sort_values(by='pairs_id')\ndf_population.tail()\n\n# Make a copy in the specific subfolder\ndf_population.to_csv(os.path.join(\n data_subdirectory, 'df_population.csv'), index=None)\n\n# Read back from disk\ndf_population = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_population.csv'))\n\ndf_population.tail()\n\n#%%\n\n# Calculate the population density (one-time calculation)\n\"\"\"\nimport shapely.ops as ops\nimport pyproj\nfrom functools import partial\n\ndef geom_area(geom):\n # Calculate area for lat-lon polygon in km2\n geom_transformed = ops.transform(\n partial(\n pyproj.transform,\n pyproj.Proj(init='EPSG:4326'),\n pyproj.Proj(\n proj='aea',\n lat_1=geom.bounds[1],\n lat_2=geom.bounds[3])),\n geom)\n return geom_transformed.area / 1e6\n\ndf_pop_density = pandas.merge(df_population, df_region[['pairs_id', 'poly']], on='pairs_id')\n#df_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: x.area)\ndf_pop_density['population_density'] = df_pop_density['population'] / df_pop_density['poly'].apply(lambda x: geom_area(x))\ndel df_pop_density['poly']\ndel df_pop_density['population']\n\n# Write to 
disk\ndf_pop_density.to_csv('data/df_pop_density.csv', index=False)\n\"\"\"\n\n#%%\n\n# Read population density from disk\ndf_pop_density = pandas.read_csv('data/df_pop_density.csv')\n\n# Make a copy in the specific subfolder\ndf_pop_density.to_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'), index=None)\n\n# Read back from disk\ndf_pop_density = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_pop_density.csv'))\ndf_pop_density.tail()\n\n#%%\n\n# Age-related census population data\n\n# AGEGRP\n# 0 = Total\n# 1 = Age 0 to 4 years\n# 2 = Age 5 to 9 years\n# 3 = Age 10 to 14 years\n# 4 = Age 15 to 19 years\n# 5 = Age 20 to 24 years\n# 6 = Age 25 to 29 years\n# 7 = Age 30 to 34 years\n# 8 = Age 35 to 39 years\n# 9 = Age 40 to 44 years\n# 10 = Age 45 to 49 years\n# 11 = Age 50 to 54 years\n# 12 = Age 55 to 59 years\n# 13 = Age 60 to 64 years\n# 14 = Age 65 to 69 years\n# 15 = Age 70 to 74 years\n# 16 = Age 75 to 79 years\n# 17 = Age 80 to 84 years\n# 18 = Age 85 years or older\n\ndf_age_20_24 = df_census[df_census['AGEGRP'] ==\n 5].reset_index(drop=True) # Age 20 to 24 years\ndf_age_60_64 = df_census[df_census['AGEGRP'] ==\n 13].reset_index(drop=True) # Age 60 to 64 years\ndf_AgeRatio = df_age_60_64[['FIPS', 'TOT_POP']].set_index(\n 'FIPS') / df_age_20_24[['FIPS', 'TOT_POP']].set_index('FIPS')\ndf_AgeRatio = df_AgeRatio.rename(columns={'TOT_POP': 'AgeRatio'}).reset_index()\n\n# Clip outliers\ndf_AgeRatio['AgeRatio'] = df_AgeRatio['AgeRatio'].clip(lower=0.1, upper=20)\ndf_AgeRatio['LogAgeRatio'] = numpy.log10(df_AgeRatio['AgeRatio'])\n\ndf_AgeRatio = pandas.merge(\n df_fips[['pairs_id', 'FIPS']], df_AgeRatio, on='FIPS').drop(columns='FIPS')\n# Even after removing duplicates there are two FIPS in Alaska pointing to the same pairs_id so groupby mean pairs_id\ndf_AgeRatio = df_AgeRatio.groupby(\n 'pairs_id').mean().reset_index().sort_values(by='pairs_id')\n\n# Make a copy in the specific subfolder\ndf_AgeRatio.to_csv(os.path.join(data_subdirectory,\n 'df_AgeRatio.csv'), index=None)\n\n# Read back from disk\ndf_AgeRatio = pandas.read_csv(os.path.join(\n data_subdirectory, 'df_AgeRatio.csv'))\n\ndf_AgeRatio.tail()\n\n#%%\n\ndef process_covid_data(df_new, df_population, rolling_window_log, rolling_window_growth):\n \"\"\"\n :df_new: DataFrame with the new cases and fatalities\n\n Returns: df_log_new (Semi-log cleaned up daily cases and fatalities)\n Returns: df_log_new_rolling (Semi-log cleaned up daily data with 5 day rolling mean)\n Returns: df_growth (Growth in daily cases and fatalities)\n Returns: df_new_rolling_scaled (Daily cases and fatalities cleand up, rolling mean, normalized by 100K population)\n \"\"\"\n # Build a semi-log version of the data and clean up\n df_log_new = df_new.copy()\n df_log_new[df_log_new <= 0] = numpy.nan\n df_log_new = df_log_new.apply(lambda x: numpy.log(x))\n df_log_new = df_log_new.replace([numpy.inf, -numpy.inf], numpy.nan)\n\n # Remove outliers (non-symmetrical so that we don't erase too many valid high-value outliers)\n df_log_new[(df_log_new < df_log_new.rolling(3, center=True).mean() - 0.8) |\n ((df_log_new > df_log_new.rolling(3, center=True).mean() + 1.2) & (df_log_new > 4))] = numpy.nan\n\n # Interpolate to fill in missing values\n df_log_new = df_log_new.interpolate(method='linear', limit_area='inside')\n\n # Rolling Mean\n df_log_new_rolling = df_log_new.rolling(\n rolling_window_log, min_periods=1).mean()\n\n # Growth rate (don't use min_periods=1 because it generates too many outliers)\n df_growth = df_log_new_rolling.diff()\n\n # 
Mask bad growth values due to derivatives of small numbers\n SMALL_VALUE = -1\n df_growth[df_log_new <= SMALL_VALUE] = numpy.nan\n\n # Interpolate inside to fill in missing values\n df_growth = df_growth.interpolate(method='linear', limit_area='inside')\n\n # Filling outside nans with zero\n df_growth = df_growth.fillna(0)\n\n # Apply rolling mean for growth\n df_growth = df_growth.rolling(rolling_window_growth, min_periods=1).mean()\n\n # Scale by 100K population\n df_new_rolling_scaled = numpy.exp(df_log_new_rolling)\n for pairs_id in df_log_new_rolling.columns.get_level_values('pairs_id').unique():\n df_new_rolling_scaled[pairs_id] = df_new_rolling_scaled[pairs_id] * 100000. / \\\n df_population[df_population['pairs_id']\n == pairs_id]['population'].values[0]\n\n return df_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled\n\n#%%\n\n# Process the Covid data\ndf_log_new, df_log_new_rolling, df_growth, df_new_rolling_scaled = process_covid_data(df_new.swaplevel(axis=1)[['Cases']].swaplevel(axis=1),\n df_population,\n rolling_window_log=ROLLING_WINDOW,\n rolling_window_growth=ROLLING_WINDOW)\n\ndf_log_new_rolling.tail()\n\n#%%\n\n# Mobility rolling mean\ndf_Mobility_rolling = df_m_unstacked.rolling(\n ROLLING_WINDOW, min_periods=1).mean()\n\n# LogMobility rolling mean\ndf_LogMobility_rolling = df_Mobility_rolling.swaplevel(axis=1).rename(\n columns={'Mobility': 'LogMobility'}).swaplevel(axis=1).apply(lambda x: numpy.log10(x))\n\ndf_LogMobility_rolling.tail()\n\n#%%\n\n# Filter out data before dt_cutoff_min. Corona numbers are too low.\n# We are doing this here AFTER the rolling means have been applied\ndel df_new\ndel df_log_new\ndf_LogCases = df_log_new_rolling[df_log_new_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_log_new_rolling\ndf_GrowthCases = df_growth[df_growth.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_growth\ndf_CasesCapita = df_new_rolling_scaled[df_new_rolling_scaled.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Cases']\ndel df_new_rolling_scaled\ndf_Mobility = df_Mobility_rolling[df_Mobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'Mobility']\ndel df_Mobility_rolling\ndf_LogMobility = df_LogMobility_rolling[df_LogMobility_rolling.index >= dt_cutoff_min].swaplevel(axis=1)[\n 'LogMobility']\ndel df_LogMobility_rolling\n\n#%%\n\n# Make a copy in the specific subfolder\ndf_LogCases.to_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_GrowthCases.to_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\n\n# Make a copy in the specific subfolder\ndf_CasesCapita.to_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\n\n# Make a copy in the specific subfolder\ndf_Mobility.to_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))", "original_comment": "# Make a copy in the specific subfolder\n", "target_code": "df_LogMobility.to_csv(os.path.join(data_subdirectory, 'df_LogMobility.csv'))\n", "project_metadata": {"full_name": "IBM/ibmpairs", "description": "open source tools for interaction with IBM PAIRS:", "topics": ["ibm-research", "ibm-pairs-geoscope", "geospatial", "geospatial-analytics", "big-data", "big-data-analytics", "restful-api-wrapper", "gis-utils"], "git_url": "git://github.com/IBM/ibmpairs.git", "stars": 11, "watchers": 11, "forks": 11, "created": "2019-05-01T14:17:22Z", "size": 22983, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 570094, "Python": 202865, "Shell": 2742, "Dockerfile": 1938}, 
"last_updated": "2020-11-18T02:13:35Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "df_LogCases = pd.read_csv(os.path.join(data_subdirectory, 'df_LogCases.csv'))\ndf_GrowthCases = pd.read_csv(os.path.join(data_subdirectory, 'df_GrowthCases.csv'))\ndf_CasesCapita = pd.read_csv(os.path.join(data_subdirectory, 'df_CasesCapita.csv'))\ndf_Mobility = pd.read_csv(os.path.join(data_subdirectory, 'df_Mobility.csv'))\n", "model": "natural", "intent": "# Make a copy in the specific subfolder"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n 
if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n\ndf['source'].value_counts()\n\n\ndf.groupby(['source', 'partition']).size()\n\n\ndf['digit'].value_counts()\n\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n\ntarget_shape = (32, 32)\n\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) 
and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n\nMODEL_DIR\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n\nwandb.init()\n\n\n# ### Run Training\n\n\nMODEL_DIR\n\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT 
= osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n#%%\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n#%%\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n#%%\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n#%%\n\ndf['source'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'partition']).size()\n\n#%%\n\ndf['digit'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n#%%\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n#%%\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n#%%\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n#%%\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n#%%\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n#%%\n\ntarget_shape = (32, 32)\n\n#%%\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate 
out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n#%%\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n#%%\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n#%%\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n#%%\n\nwandb.init()\n\n\n# ### Run Training\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n#%%\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n 
validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n#%%\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)", "original_comment": "# Initialize from pre-trained model\n", "target_code": "model.load_weights(MODEL_PATH_HEAD)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model.load_weights('/lab/repos/svhn/weights.hdf5')\n", "model": "no-comments", "intent": "# Initialize from pre-trained model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\n\n\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n\ncorpus\n\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n\n# AFINN Dictionary for Sentiment Analysis: https://github.com/fnielsen/afinn\n# 
https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n\npred\n\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#
Sentiment Analysis
\n#\n\n#%%\n\n# load the small embedding file\nimport os\nimport string\nimport nltk\nfrom functools import reduce # python 3\nimport csv\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nfrom nltk.stem.wordnet import WordNetLemmatizer\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport numpy as np\nfrom sklearn.metrics import precision_recall_fscore_support, accuracy_score\nfrom afinn import Afinn\nimport codecs\nimport pandas as pd\nfrom nltk.corpus import stopwords\nimport gensim\nsmall_model = gensim.models.KeyedVectors.load_word2vec_format(\n '/Users/Ashrakat/Desktop/small-embeddings.txt', binary=False)\n\n#%%\n\n# general pipeline + embedd\n\n\nexclude = set(string.punctuation)\nstop_word_list = stopwords.words('english')\n\n# input should be a string, you convert text in a doc-embedding\n\n\ndef text_embedding(text):\n\n # it depends if the words are lowercased or not in the word embeddings that you use, if they are not skip this step\n text = text.lower()\n\n text = nltk.word_tokenize(text)\n\n # remove numbers\n text = [token for token in text if token not in exclude and token.isalpha()]\n\n # remove stopwords (not essential)\n text = [token for token in text if token not in stop_word_list]\n\n article_embedd = []\n\n # you take all embeddings\n for word in text:\n try:\n embed_word = small_model[word]\n article_embedd.append(embed_word)\n except KeyError:\n continue\n\n # then you average them\n avg = [float(sum(col))/len(col) for col in zip(*article_embedd)]\n\n return avg\n\n#%%\n\n# if we want to take a look using pandas - just for visualization\nsentiment = pd.read_csv(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", sep=\"\\t\", header=None)\nsentiment = sentiment[1:]\nsentiment.head()\n\n#%%\n\nsentiment.to_csv('/Users/Ashrakat/Downloads/yelp-test.tsv',\n index=False, sep=\"\\t\")\n\n#%%\n\n# open YELP product reviews dataset\n# we are using only the \"small\" test-set, you can also train on the large training set if you'd like\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\n\n\n#%%\n\n# first, we define two folders, \"corpus\" - with the text and \"labels\", with the labels\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values\n\n corpus.append(text)\n labels.append(label)\n\n#%%\n\ncorpus\n\n#%%\n\nlabels\n\n\n# # Sentiment Analysis using Word list based approaches\n\n# \"One of the simplest sentiment analysis approaches:\n# - compares the words of a text against a labeled word list\n# - where each word has been scored for valence, \u2014 **a \u201csentiment lexicon\u201d** \"\n#\n# Check Paper by Finn \u02daArup Nielsen: http://www2.imm.dtu.dk/pubdb/edoc/imm6006.pdf\n\n#%%\n\n# AFINN Dictionary for Sentiment Analysis: 
https://github.com/fnielsen/afinn\n# https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-111.txt\n\n#!pip install afinn\n\n\nafinn = Afinn()\n\nprint(afinn.score(\"This is bad fake news\"))\n\nprint(afinn.score(\"The sun is shining, what a beautiful day\"))\n\nprint(afinn.score(\"That movie is horrible and beautiful at the same time\"))\n\n#%%\n\npred = []\n\n# for sentence in the corpus predict its scores\nfor review in corpus:\n score = afinn.score(review)\n\n # I only want two labels for each of my sentences\n # if the score is below 0 give me a value -1, and if over 0 give me the value +1\n if score < 0.0:\n pred.append(\"-1\")\n else:\n pred.append(\"1\")\n\n#%%\n\npred\n\n#%%\n\nprint(precision_recall_fscore_support(labels, pred, average=\"macro\"))\n\n\n# # Sentiment Analysis as a Classification Problem\n#\n#\n\n#%%\n\nsentiment_dataset = codecs.open(\n \"/Users/Ashrakat/Downloads/yelp-test.tsv\", \"r\", \"utf-8\").read().strip().split(\"\\n\")\n\nprint(sentiment_dataset[0])\nprint(\" \")\nprint(sentiment_dataset[1])\n\n#%%\n\ncorpus = []\nlabels = []\n\n# be careful with this, the dataset is huge!\n# for line in sentiment_dataset:\nfor line in sentiment_dataset[1:1000]:\n\n # its a tab seperated file\n # remove the - replace with nothing\n text = line.split(\"\\t\")[1].replace('\"', '')\n label = line.split(\"\\t\")[0].replace('\"', '').replace(\n \"1\", \"-1\").replace(\"2\", \"1\") # change values", "original_comment": " # as usual, we use text-embeddings\n", "target_code": " text = text_embedding(text)\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "model = word2vec.Text8Corpus(\"/Users/Ashrakat/Downloads/yelp-test.tsv\")\n", "model": "docstring", "intent": " # use text-embeddings"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 
'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata[data.columns[data.dtypes == int]]\n\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata[data.columns[data.dtypes == int]]\n\n#%%\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n#%%\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.get_covariance()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly 
disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.scatter(two[:, 0], two[:, 1])\nplt.show()\n", "model": "no-comments", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. 
It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. 
CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n 
ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML model\n# * Hyperparameter choices and 
justification\n#\n# A baseline model for solving this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each and relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorical cross-entropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
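\n#\n# The next cell is a minimal, self-contained sketch added for illustration only: it is not part of the original baseline, it uses synthetic data and plain Keras layers instead of the mlmicrophysics DenseNeuralNetwork wrapper, and the names X, labels, y, and dense_stack are hypothetical.\n# It shows the in-series idea described above: a classifier first predicts whether a sample falls in the zero or non-zero regime, and a regressor is trained and applied only on the non-zero samples.\n\nimport numpy as np\nfrom tensorflow import keras\n\n# toy stand-ins for the scaled inputs, class labels, and scaled outputs\nX = np.random.randn(256, 5).astype(\"float32\")\nlabels = np.random.randint(0, 2, size=256)\ny = np.where(labels == 1, np.random.randn(256), 0.0).astype(\"float32\")\n\n\ndef dense_stack(out_units, out_activation):\n    # 4 hidden layers of 30 relu units, mirroring the baseline description\n    hidden = [keras.layers.Dense(30, activation=\"relu\") for _ in range(4)]\n    return keras.Sequential(hidden + [keras.layers.Dense(out_units, activation=out_activation)])\n\n\n# classifier: zero vs non-zero tendency\nclf = dense_stack(2, \"softmax\")\nclf.compile(optimizer=keras.optimizers.Adam(1e-4), loss=\"sparse_categorical_crossentropy\")\nclf.fit(X, labels, epochs=1, batch_size=64, verbose=0)\n\n# regressor: fit only on the samples whose true class is non-zero\nreg = dense_stack(1, \"linear\")\nreg.compile(optimizer=keras.optimizers.Adam(1e-4), loss=\"mse\")\nreg.fit(X[labels == 1], y[labels == 1], epochs=1, batch_size=64, verbose=0)\n\n# inference: regress only where the classifier predicts the non-zero class\npred_class = clf.predict(X, verbose=0).argmax(axis=1)\npred = np.zeros(len(X), dtype=\"float32\")\nif (pred_class == 1).any():\n    pred[pred_class == 1] = reg.predict(X[pred_class == 1], verbose=0).ravel()\n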
\n#\n\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} 
output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # AI for Earth System Science Hackathon 2020\n# # Microphysics Machine Learning Challenge Problem\n#\n# Andrew Gettelman, Jack Chen, David John Gagne\n#\n# ## Introduction\n# Cloud processes are perhaps the most critical and uncertain processes for weather and climate prediction. The complex nature of sub grid scale clouds makes traceable simulation of clouds across scales difficult (or impossible). There exist many observations and detailed simulations of clouds that are used to develop and evaluate larger scale models. Many times these models and measurements are used to develop empirical relationships for large scale models to be computationally efficient. Machine learning provides another potential tool to improve our empirical parameterizations of clouds. Here we present a comprehensive investigation of replacing the warm rain formation process in an earth system model with emulators that use detailed treatments from small scale and idealized models to represent key cloud microphysical processes.\n#\n# The warm rain formation process is critical for weather and climate prediction. When rain forms governs the location, intensity and duration of rainfall events, critical for weather and the hydrologic cycle. 
Rain formation also affects cloud lifetime and the radiative properties of low clouds, making it critical for predicting climate (twomey1977,albrecht1989) The specific process of rain formation is altered by the microphysical properties of clouds, making rain formation susceptible to the size distribution of cloud drops, and ultimately to the distribution of aerosol particles that act as Cloud Condensation Nuclei.\n#\n# Ice of course will complicate the precipitation process. Supercooled liquid drops can exist, and these will either precipitation in a similar manner to warm precipitation (with no ice involved) and subsequently may freeze once they are rain drops. Or cloud droplets may freeze and form ice crystals, which precipitate and collect liquid, freezing or riming as they fall. We will not concern ourselves in this work with processes involving (or potentially involving) ice. This of course is a critical issue for weather (forbes2014)and climate (gettelman2019b,bodas-salcedo2019)prediction.\n#\n# The representation of rain formation in clouds involves the interaction of a population of hydrometeors. For warm clouds, the process is one of collision and coalescence, usually defined with a detailed process of stochastic collection (pruppacher1997). The stochastic collection process describes how each size particle interacts with other sizes. Usually there is a distribution of small cloud drops with an extension or separate distribution of rain drops whose interactions are evaluated.\n#\n# The stochastic collection process is computationally expensive to treat directly in large scale global models for weather and climate prediction. It requires the pre-computation of a collection kernel for how different sizes of hydrometeors will interact due to differential fall speeds, and it requires tracking populations discretized by bins. This tracking and advection of the order of 60 different bins for liquid and ice combined makes it computationally expensive. So traditionally, large scale models with bulk microphysics treat the stochastic collection process of warm rain formation in a heavily parameterized fashion (khairoutdinov2000,seifert200) For conceptual simplicity, the process is often broken up into two processes. Autoconversion is the transition of cloud drops into rain as part of a cloud droplet distribution grows to large sizes. Methods for determining autoconversion and accretion are varied. Because they are the major loss mechanism for cloud water different descriptions of the processes result in very different model evolution and climates (michibata2015).\n#\n# Because many methods for autoconversion and accretion are just empirical fits to data or other models, they are readily applicable to replacement with more sophisticated tools. Neural Networks are multivariate emulators that allow many more degrees of freedom than traditional polynomial methods for example.\n#\n\n# ## Software Requirements\n# This notebook requires Python >= 3.7. 
The following libraries are required:\n# * numpy\n# * scipy\n# * pandas\n# * matplotlib\n# * xarray\n# * scikit-learn\n# * tensorflow >= 2.1\n# * netcdf4\n# * h5netcdf\n# * tqdm\n# * pyyaml\n# * s3fs\n# * pyarrow\n\n#%%\n\nfrom tensorflow.keras.layers import GRU, LSTM\nfrom tensorflow.keras import layers\nfrom tensorflow import keras\nfrom sklearn.decomposition import PCA\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.preprocessing import power_transform\nfrom mlmicrophysics.data import log10_transform, categorize_output_values\nfrom mlmicrophysics.data import subset_data_files_by_date, assemble_data_files\nfrom mlmicrophysics.models import DenseNeuralNetwork\nfrom mlmicrophysics.metrics import heidke_skill_score, peirce_skill_score, hellinger_distance, root_mean_squared_error, r2_corr\nimport tensorflow as tf\nfrom sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error\nfrom sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport s3fs\nimport os\nfrom os.path import join, exists\nimport yaml\nimport sys\nimport random\nimport argparse\nget_ipython().system('pip install numpy scipy pandas matplotlib xarray scikit-learn tensorflow netcdf4 h5netcdf tqdm pyyaml s3fs pyarrow mlmicrophysics')\n\n#%%\n\nget_ipython().system(' pip install --upgrade pandas')\n\n\n# # if working on google colab\n# ! pip install -U -q PyDrive\n# from google.colab import drive\n# drive.mount('/content/gdrive')\n\n# ## Data\n#\n# The Community Atmosphere Model version 6 (CAM6) is the atmospheric component of the Community Earth System Model version 2 (danabasoglu2020). CAM6 features a two-moment stratiform cloud microphysics scheme [hereafter MG2](gettelman2015b,gettelman2015a) with prognostic liquid, ice, rain and snow hydrometeor classes. MG2 permits ice supersaturation. CAM6 includes a physically based ice mixed phase dust ice nucleation scheme (hoose2010) with modifications for a distribution of contact angles (wang2014), and accounts for preexisting ice in the cirrus ice nucleation of (liu2005) as described by (shi2015).\n#\n# MG2 is coupled to a unified moist turbulence scheme, Cloud Layers Unified by Binormals (CLUBB), developed by (golaz2002) and (larson2002) and implemented in CAM by (bogenschutz2013). CLUBB handles stratiform clouds, boundary layer moist turbulence and shallow convective motions. CAM6 also has an ensemble plume mass flux deep convection scheme described by (zhang1995) and (neale2008), which has very simple microphysics. The radiation scheme is The Rapid Radiative Transfer Model for General Circulation Models (RRTMG) (iacono2000).\n#\n# Within the MG2 parameterization, the warm rain formation process is represented by equations for autoconversion and accretion from (khairoutdinov2000), hereafter KK2000. KK2000 uses empirical fits to a large eddy simulation with bin-resolved microphysics to define:\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{AUTO} = 13.5 q_c^{2.47} N_c^{-1.1}\n# \\end{equation}\n# \\begin{equation}\n# \\left(\\frac{\\partial q_r}{\\partial t} \\right)_{ACCRE} = 67 (q_c q_r)^{1.15}\n# \\end{equation}\n# Where $q_c$ and $q_r$ are mass mixing ratios for condensate and rain, and $N_c$ is the number concentration of condensate. 
For CAM6 the autconversion rate exponent and prefactor has been adjusted from the original (khairoutdinov2000) scheme to better match observations (gettelman2019b).\n#\n# #### Stochastic Collection\n#\n# We replace the KK2000 process rate equations with an estimate of the stochastic collection process from the Tel Aviv University (TAU) model. The TAU model uses a \"bin\" or \"sectional\" approach, where the drop size distribution is resolved into 35 size bins. It differs from most other microphysical codes in that it solves for two moments of the drop size distribution in each of the bins. This allows for a more accurate transfer of mass between bins and alleviates anomalous drop growth. The original components were developed by Tzivion et al. (1987), (1989), Feingold et al. (1988) with later applications and development documented in Reisin et al. (1996), Stevens et al. (1996), Feingold et al. (1999), Tzivion et al. (1999), Yin et al (2000) and Harrington et al. (2000).\n#\n# Cloud Parcel Model Documentation here: https://www.esrl.noaa.gov/csl/staff/graham.feingold/code/readme.html\n#\n# First we convert the size distributions for liquid and rain into number concentrations in individual size bins. Liquid and rain are put in the same continuous distribution of 32 size bins for the TAU code. Then we use this as input to the TAU code, running the stochastic collection kernel. The result is a revised set of 32 bins with number concentration in each bin. We the find a minimum in the distribution if present: this is always found in the case where there is rain and condensate present at the end of the calculation. The minimum is typically between 40 and 100 microns (diameter). This minimium is used to divide the bins into liquid and rain. The total number and mass in each is defined, and tendencies calculated as the final mass and number minus the initial mass and number divided by the timestep. A limiter is applied to ensure that the mass and number are non-zero, and tendencies limited to ensure this. 
This estimated stochastic collection tendency is then applied instead of the accretion and autoconversion tendencies.\n#\n# The code does run the accretion and autoconversion from MG2 on the same state, and we can save this off as a diagnostic, so we can directly compare the original MG2 tendency (autoconversion + accretion) with the stochastic collection tendency from the TAU code.\n#\n# The microphysics datasets contains 176 files containing\n#\n\n# ### Time span of the dataset\n# | | Datetime |\n# | ---- | :----:|\n# | Start | Jan 1 |\n# | Length | 2 years |\n#\n# ### Geographic Coverage of Dataset\n# | | Latitude | Longitude |\n# | ------------- | :----:|:----------- |\n# | Max | 90 | 358.75 |\n# | Min | -90 | 0 |\n#\n# ### Potential Input Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | QC_TAU_in | kg/kg | cloud water mixing ratio |\n# | NC_TAU_in | kg-1 | cloud droplet column concentration |\n# | QR_TAU_in | kg/kg | rain water mixing ratio |\n# | NR_TAU_in | kg-1 | rain droplet column concentration |\n# | RHO_CLUBB_lev | kg/m3 | air density at center of grid cell |\n#\n# ### Output Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | qrtend_TAU | kg/kg/s | qr tendency due to autoconversion & accretion in TAU bin |\n# | nrtend_TAU | kg/kg/s | nr tendency due to autoconversion & accretion in TAU bin |\n# | nctend_TAU | kg/kg/s | nc tendency due to autoconversion & accretion in TAU bin |\n#\n# ### Meta Variables\n# | Variable Name | Units | Description |\n# | ------------- | :----:|:----------- |\n# | lat | degrees_north | latitude |\n# | lev | hPa | atmospheric level |\n# | lon | degrees_east | longitude |\n# | depth | arbitrary | depth index |\n# | row | arbitrary | row index |\n# | col | arbitrary | column index |\n# | pressure | Pa | atmospheric pressure |\n# | temperature | K | temperature derived from pressure and density |\n# | time | days | time in days |\n# | qrtend_MG2 | kg/kg/s | qr tendency due to autoconversion & accretion in MG2 |\n# | nrtend_MG2 | kg/kg/s | nr tendency due to autoconversion & accretion in MG2 |\n# | nctend_MG2 | kg/kg/s | nc tendency due to autoconversion & accretion in MG2 |\n#\n\n# ### Training, Validation, and Test Datasets\n#\n# There are 176 files that will be split into training, validation, and test datsets via indices found in the `subset_data` variable defined below. 
In total, these files contain 85,263,948 data points and is randomly sampled using the `subsample` variable below.\n#\n\n#%%\n\n# set random seed\nseed = 328942\nnp.random.seed(seed)\nrandom.seed(seed)\ntf.random.set_seed(seed)\n\n#%%\n\n# define data parameters\n\ndata_path = \"ncar-aiml-data-commons/microphysics\"\nIN_COLAB = 'google.colab' in sys.modules\nif IN_COLAB:\n out_path = \"/content/gdrive/My Drive/micro_models/base\"\nelse:\n out_path = \"./micro_models/base/\"\nif not exists(out_path):\n os.makedirs(out_path)\nsubsample = 0.1\ninput_cols = [\"QC_TAU_in\", \"NC_TAU_in\",\n \"QR_TAU_in\", \"NR_TAU_in\", \"RHO_CLUBB_lev\"]\noutput_cols = [\"qrtend_TAU\", \"nctend_TAU\", \"nrtend_TAU\"]\n\nsubset_data = {\"train_date_start\": 0,\n \"train_date_end\": 11000,\n \"test_date_start\": 11100,\n \"test_date_end\": 17500}\n\ninput_scaler = StandardScaler()\ninput_transforms = {\"QC_TAU_in\": \"log10_transform\",\n \"NC_TAU_in\": \"log10_transform\",\n \"QR_TAU_in\": \"log10_transform\",\n \"NR_TAU_in\": \"log10_transform\"}\n\noutput_transforms = {\"qrtend_TAU\": {0: [\"<=\", 1e-18, \"zero_transform\", \"None\"],\n 1: [\">\", 1e-18, \"log10_transform\", \"StandardScaler\"]},\n \"nctend_TAU\": {0: [\">=\", -1e-18, \"zero_transform\", \"None\"],\n 1: [\"<\", -1e-18, \"neg_log10_transform\", \"StandardScaler\"]},\n \"nrtend_TAU\": {-1: [\"<\", 0, \"neg_log10_transform\", \"StandardScaler\"],\n 0: [\"==\", 0, \"zero_transform\", \"None\"],\n 1: [\">\", 0, \"log10_transform\", \"StandardScaler\"]}}\n\n#%%\n\n# Load data from disk or cloud\n# Separate input, output and meta data\n# Split into training, validation, and test sets\n\nprint(\"Subsetting file paths by train, validation, and test\")\ntrain_files, val_files, test_files = subset_data_files_by_date(\n data_path, **subset_data)\n\nprint(\"\\nLoading training data\")\nscaled_input_train, labels_train, transformed_out_train, scaled_out_train, output_scalers, meta_train = assemble_data_files(train_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, subsample=subsample)\n\nprint(\"\\nLoading testing data\")\nscaled_input_test, labels_test, transformed_out_test, scaled_out_test, output_scalers_test, meta_test = assemble_data_files(test_files, input_cols, output_cols, input_transforms,\n output_transforms, input_scaler, output_scalers=output_scalers,\n train=False, subsample=subsample)\n\n#%%\n\n# save meta data, input scalers, and output scalers\n\nmeta_test.to_csv(join(out_path, \"meta_test.csv\"), index_label=\"index\")\n\ninput_scaler_df = pd.DataFrame({\"mean\": input_scaler.mean_, \"scale\": input_scaler.scale_},\n index=input_cols)\ninput_scaler_df.to_csv(\n join(out_path, \"input_scale_values.csv\"), index_label=\"input\")\n\nout_scales_list = []\nfor var in output_scalers.keys():\n for out_class in output_scalers[var].keys():\n if output_scalers[var][out_class] is not None:\n out_scales_list.append(pd.DataFrame({\"mean\": output_scalers[var][out_class].mean_,\n \"scale\": output_scalers[var][out_class].scale_},\n index=[var + \"_\" + str(out_class)]))\nout_scales_df = pd.concat(out_scales_list)\nout_scales_df.to_csv(join(out_path, \"output_scale_values.csv\"),\n index_label=\"output\")\nout_scales_df\n\n#%%\n\n# Histograms of original training input data by column\n\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\ntransformed_input_train = pd.DataFrame(\n input_scaler.inverse_transform(scaled_input_train), columns=input_cols)\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n 
ax.set_yscale(\"log\")\n ax.hist(transformed_input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nfor output_col, ax in zip(output_cols, (ax1, ax2, ax3)):\n original_out_train_nc = np.zeros(scaled_out_train.shape[0])\n original_out_train_nc[labels_train[output_col] == 1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == 1, [output_col]]).ravel()\n original_out_train_nc[labels_train[output_col] == -1] = -10 ** output_scalers[output_col][1].inverse_transform(\n scaled_out_train.loc[labels_train[output_col] == -1, [output_col]]).ravel()\n ax.hist(\n np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\n ax.set_xlabel(output_col)\n ax.set_ylabel('log10')\n ax.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\n ax.set_yscale('log')\n\n#%%\n\n# Inverse transform and scaling of scaled train data\n\noriginal_out_train_nr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == 1] = 10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]]).ravel()\noriginal_out_train_nr[labels_train[\"nrtend_TAU\"] == -1] = -10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]]).ravel()\n\n\noriginal_out_train_nc = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_nc[labels_train[\"nctend_TAU\"] == 1] = -10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]]).ravel()\n\noriginal_out_train_qr = np.zeros(scaled_out_train.shape[0])\noriginal_out_train_qr[labels_train[\"qrtend_TAU\"] == 1] = 10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_train.loc[labels_train[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]]).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\n\noutput_col = \"nrtend_TAU\"\nax1.hist(\n np.log10(-original_out_train_nr[original_out_train_nr < 0]), bins=50, label=\"<0\")\nax1.hist(np.log10(\n original_out_train_nr[original_out_train_nr > 0]), bins=50, label=\">0\")\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\nax2.hist(np.log10(-original_out_train_nc[original_out_train_nc < 0]), bins=50)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\nax3.hist(np.log10(original_out_train_qr[original_out_train_qr > 0]), bins=50)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n#%%\n\n# load and view a single file\n\nfs = s3fs.S3FileSystem(anon=True)\nfilenames = fs.ls(\"s3://ncar-aiml-data-commons/microphysics\")\nfobj = fs.open(filenames[0])\nsingle_file = pd.read_parquet(fobj).set_index('Index')\nsingle_file.head()\n\n\n# ## Baseline Machine Learning Model\n# Description of baseline ML approach should include:\n# * Choice of ML software\n# * Type of ML 
model\n# * Hyperparameter choices and justification\n#\n# A baseline model for solving this problem uses an in-series classifier-to-regressor neural network architecture implemented in Keras. Initially, there are three classifier networks that feed into four regressor networks. Each classifier and regressor network has 4 hidden layers of 30 neurons each and relu activation. Those hidden layers then feed into a final output layer of size 2 or 3 for classification (1 and 0, or 1, 0, and -1) and of size 1 for regression. The classifier models are trained using the categorical cross-entropy loss function while the regression models are trained using the mean squared error loss function.\n#\n#
\n#\n\n#%%\n\n# define model hyper parameters\n\nclassifier_metrics = [\"acc\", \"pss\", \"hss\"]\nregressor_metrics = [\"mse\", \"mae\", \"r2\", \"hellinger\"]\n\nclassifier_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"categorical_crossentropy\",\n \"output_activation\": \"softmax\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 1}\n\nregressor_networks = {\"hidden_layers\": 4,\n \"hidden_neurons\": 30,\n \"loss\": \"mse\",\n \"output_activation\": \"linear\",\n \"activation\": \"relu\",\n \"epochs\": 5,\n \"batch_size\": 1024,\n \"verbose\": 1,\n \"lr\": 0.0001,\n \"l2_weight\": 1.0e-5,\n \"classifier\": 0}\n\n# hyperparameter dictionaries\nclass_metrics = {\"accuracy\": accuracy_score,\n \"heidke\": heidke_skill_score,\n \"peirce\": peirce_skill_score}\n\nreg_metrics = {\"rmse\": root_mean_squared_error,\n \"mae\": mean_absolute_error,\n \"r2\": r2_corr,\n \"hellinger\": hellinger_distance}\n\n#%%\n\n# build and fit the model\n\nhistories = {\"classifiers\": {}, \"regressors\": {}}\nclassifiers = dict()\nregressors = dict()\nreg_index = []\nfor output_col in output_cols:\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n reg_index.append(output_col + f\"_{label:d}\")\ntest_prediction_values = np.zeros((scaled_out_test.shape[0], len(reg_index)))\ntest_prediction_labels = np.zeros(scaled_out_test.shape)\nclassifier_scores = pd.DataFrame(0, index=output_cols, columns=[\n \"accuracy\", \"heidke\", \"peirce\"])\nconfusion_matrices = dict()\nreg_cols = [\"rmse\", \"mae\", \"r2\", \"hellinger\"]\nreg_scores = pd.DataFrame(0, index=reg_index, columns=reg_cols)\nl = 0\n\nfor o, output_col in enumerate(output_cols):\n print(\"Train Classifer \", output_col)\n classifiers[output_col] = DenseNeuralNetwork(**classifier_networks)\n hist = classifiers[output_col].fit(scaled_input_train,\n labels_train[output_col],\n scaled_input_test,\n labels_test[output_col])\n histories[\"classifiers\"][output_col] = hist\n classifiers[output_col].save_fortran_model(join(out_path,\n \"dnn_{0}_class_fortran.nc\".format(output_col[0:2])))\n classifiers[output_col].model.save(\n join(out_path, \"dnn_{0}_class.h5\".format(output_col[0:2])))\n regressors[output_col] = dict()\n histories[\"regressors\"][output_col] = dict()\n print(\"Evaluate Classifier\", output_col)\n test_prediction_labels[:, o] = classifiers[output_col].predict(\n scaled_input_test)\n confusion_matrices[output_col] = confusion_matrix(labels_test[output_col],\n test_prediction_labels[:, o])\n for class_score in classifier_scores.columns:\n classifier_scores.loc[output_col, class_score] = class_metrics[class_score](labels_test[output_col],\n test_prediction_labels[:, o])\n print(classifier_scores.loc[output_col])\n for label in list(output_transforms[output_col].keys()):\n if label != 0:\n print(\"Train Regressor \", output_col, label)\n regressors[output_col][label] = DenseNeuralNetwork(\n **regressor_networks)\n hist = regressors[output_col][label].fit(scaled_input_train.loc[labels_train[output_col] == label],\n scaled_out_train.loc[labels_train[output_col]\n == label, output_col],\n scaled_input_test.loc[labels_test[output_col] == label],\n scaled_out_test.loc[labels_test[output_col] == label, output_col])\n histories[\"regressors\"][output_col][label] = hist\n\n if label > 0:\n out_label = \"pos\"\n else:\n out_label = \"neg\"\n regressors[output_col][label].save_fortran_model(join(out_path,\n 
\"dnn_{0}_{1}_fortran.nc\".format(output_col[0:2],\n out_label)))\n regressors[output_col][label].model.save(join(out_path,\n \"dnn_{0}_{1}.h5\".format(output_col[0:2], out_label)))\n print(\"Test Regressor\", output_col, label)\n test_prediction_values[:, l] = output_scalers[output_col][label].inverse_transform(\n regressors[output_col][label].predict(scaled_input_test))\n reg_label = output_col + f\"_{label:d}\"\n for reg_col in reg_cols:\n reg_scores.loc[reg_label,\n reg_col] = reg_metrics[reg_col](transformed_out_test.loc[labels_test[output_col] == label,\n output_col],\n test_prediction_values[labels_test[output_col] == label, l])\n print(reg_scores.loc[reg_label])\n l += 1\nprint(\"Saving data\")\nclassifier_scores.to_csv(\n join(out_path, \"dnn_classifier_scores.csv\"), index_label=\"Output\")\nreg_scores.to_csv(join(out_path, \"dnn_regressor_scores.csv\"),\n index_label=\"Output\")\ntest_pred_values_df = pd.DataFrame(test_prediction_values, columns=reg_index)\ntest_pred_labels_df = pd.DataFrame(test_prediction_labels, columns=output_cols)\ntest_pred_values_df.to_csv(\n join(out_path, \"test_prediction_values.csv\"), index_label=\"index\")\ntest_pred_labels_df.to_csv(\n join(out_path, \"test_prediction_labels.csv\"), index_label=\"index\")\nlabels_test.to_csv(join(out_path, \"test_cam_labels.csv\"), index_label=\"index\")\ntransformed_out_test.to_csv(\n join(out_path, \"test_cam_values.csv\"), index_label=\"index\")\n\n#%%\n\n# visualize classifier model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['classifiers'].keys():\n plt.plot(histories['classifiers'][k]['loss'], label=f\"{k} loss\")\n plt.plot(histories['classifiers'][k]['val_loss'], label=f\"{k} val_loss\")\nplt.title('Classifier model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n#%%\n\n# visualize regressor model performance\n\nplt.figure(figsize=(10, 6))\nfor k in histories['regressors'].keys():\n for l in histories['regressors'][k].keys():\n plt.plot(histories['regressors'][k][l]\n ['loss'], label=f\"{k} label {l} loss\")\n plt.plot(histories['regressors'][k][l]['val_loss'],\n label=f\"{k} label {l} val_loss\")\nplt.title('regressor model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(loc='upper right')\nplt.show()\n\n\n# ## Metrics\n#\n# Prediction metrics by output variable for classifier networks:\n#\n# | Variable Name | accuracy | heidke | peirce |\n# | ------------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU | 0.98 | 0.97 | 0.99 |\n# | nctend_TAU | 0.99 | 0.99 | 0.97 |\n# | nrtend_TAU | 0.98 | 0.97 | 0.99 |\n#\n# Prediction metrics by output variable for regression networks:\n#\n# | Variable Name | rmse | mae | r2 | hellinger |\n# | ------------- |:----------- |:----------- |:----------- |:----------- |\n# | qrtend_TAU_1 | 0.20 | 0.10 | 0.99 | 0.00056 |\n# | nctend_TAU_1 | 0.17 | 0.08 | 1.00 | 0.00099 |\n# | nrtend_TAU_-1 | 0.20 | 0.11 | 0.99 | 0.00056 |\n# | nrtend_TAU_1 | 0.25 | 0.16 | 0.98 | 0.00018 |\n#\n#\n\n#%%\n\n# unscaled predicted output values\n\npred_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_pred_values = np.zeros(scaled_input_test.shape[0])\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == 1, [\"nrtend_TAU_1\"]])).flatten()\nnr_pred_values[test_pred_labels_df[\"nrtend_TAU\"] == -1] = (-10 ** 
output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nrtend_TAU\"] == -1, [\"nrtend_TAU_-1\"]])).flatten()\npred_tendencies.loc[:, \"nrtend_TAU\"] = nr_pred_values\n\npred_tendencies.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"nctend_TAU\"] == 1, [\"nctend_TAU_1\"]])).ravel()\n\npred_tendencies.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n test_pred_values_df.loc[test_pred_labels_df[\"qrtend_TAU\"] == 1, [\"qrtend_TAU_1\"]])).ravel()\n\n#%%\n\n# unscaled actual output values\n\nunscaled_tendencies = pd.DataFrame(\n 0, index=scaled_out_test.index, columns=output_cols, dtype=float)\n\nnr_values = np.zeros(scaled_input_test.shape[0])\nnr_values[labels_test[\"nrtend_TAU\"] == 1] = (10 ** output_scalers[\"nrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == 1, [\"nrtend_TAU\"]])).flatten()\nnr_values[labels_test[\"nrtend_TAU\"] == -1] = (-10 ** output_scalers[\"nrtend_TAU\"][-1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nrtend_TAU\"] == -1, [\"nrtend_TAU\"]])).flatten()\nunscaled_tendencies.loc[:, \"nrtend_TAU\"] = nr_values\n\nunscaled_tendencies.loc[labels_test[\"nctend_TAU\"] == 1, \"nctend_TAU\"] = (-10 ** output_scalers[\"nctend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"nctend_TAU\"] == 1, [\"nctend_TAU\"]])).ravel()\n\nunscaled_tendencies.loc[labels_test[\"qrtend_TAU\"] == 1, \"qrtend_TAU\"] = (10 ** output_scalers[\"qrtend_TAU\"][1].inverse_transform(\n scaled_out_test.loc[labels_test[\"qrtend_TAU\"] == 1, [\"qrtend_TAU\"]])).ravel()\n\n#%%\n\n# output visualizations\n\nf, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 4))\nall_bins = [np.linspace(-16, -4, 50), np.linspace(-10,\n 6, 50), np.linspace(-200, 300, 50)]\n\noutput_col = \"nrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax1.hist(np.log10(-colp[colp < 0]), label=\"<0 pred\",\n bins=all_bins[0], color='skyblue')\nax1.hist(np.log10(colp[colp > 0]), label=\">0 pred\",\n bins=all_bins[0], color='pink')\nax1.hist(np.log10(-col[col < 0]), label=\"<0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"navy\", lw=3)\nax1.hist(np.log10(col[col > 0]), label=\">0 true\",\n bins=all_bins[0], histtype=\"step\", color=\"purple\", lw=3)\nax1.set_xlabel(output_col)\nax1.set_ylabel('log10')\nax1.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax1.set_yscale('log')\nax1.legend()\n\noutput_col = \"nctend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax2.hist(np.log10(-colp[colp < 0]), label=\"pred\",\n bins=all_bins[1], color='skyblue')\nax2.hist(np.log10(-col[col < 0]), label=\"true\",\n bins=all_bins[1], histtype=\"step\", color=\"navy\", lw=3)\nax2.set_xlabel(output_col)\nax2.set_ylabel('log10')\nax2.title.set_text(\n f\"log10-transformed {output_col} output data\\nfiltered by output_transform ops\")\nax2.set_yscale('log')\n\noutput_col = \"qrtend_TAU\"\ncolp = unscaled_tendencies[output_col]\ncol = pred_tendencies[output_col]\nax3.hist(np.log10(colp[colp > 0]), label=\"pred\", color='skyblue')\nax3.hist(np.log10(col[col > 0]), label=\"true\",\n histtype=\"step\", color=\"navy\", lw=3)\nax3.set_xlabel(output_col)\nax3.set_ylabel('log10')\nax3.title.set_text(\n f\"log10-transformed 
{output_col} output data\\nfiltered by output_transform ops\")\nax3.set_yscale('log')\n\nplt.show()\n\n\n# ### References\n#\n#\n# Albrecht, B. A. (1989). Aerosols, cloud microphysics and fractional cloudiness.Sci-449ence,245, 1227\u20131230.\n#\n# Bodas-Salcedo, A., Mulcahy, J. P., Andrews, T., Williams, K. D., Ringer, M. A.,455Field, P. R., & Elsaesser, G. S.(2019).Strong Dependence of Atmospheric456Feedbacks on Mixed-Phase Microphysics and Aerosol-Cloud Interactions in457HadGEM3.Journal of Advances in Modeling Earth Systems,11(6), 1735\u20131758.458doi: 10.1029/2019MS001688\n#\n# Bogenschutz, P. A., Gettelman, A., Morrison, H., Larson, V. E., Craig, C., & Scha-460nen, D. P.(2013).Higher-order turbulence closure and its impact on Climate461Simulation in the Community Atmosphere Model.Journal of Climate,26(23),4629655\u20139676. doi: 10.1175/JCLI-D-13-00075.1\n#\n# Danabasoglu, G., Lamarque, J.-F., Bacmeister, J., Bailey, D. A., DuVivier, A. K.,471Edwards, J., . . . Strand, W. G.(2020).The Community Earth System Model472Version 2 (CESM2).Journal of Advances in Modeling Earth Systems,12(2),473e2019MS001916. doi: 10.1029/2019MS001916\n#\n# Forbes, R. M., & Ahlgrimm, M.(2014, September).On the Representation of475High-Latitude Boundary Layer Mixed-Phase Cloud in the ECMWF Global Model.476Monthly Weather Review,142(9), 3425\u20133445. doi: 10.1175/MWR-D-13-00325.1\n#\n# Gettelman, A.(2015, November).Putting the clouds back in aerosol\u2013cloud inter-478actions.Atmos. Chem. Phys.,15(21), 12397\u201312411.doi: 10.5194/acp-15-12397479-2015480\n#\n# Gettelman, A., Bardeen, C. G., McCluskey, C. S., & Jarvinen, E. (2020). Simulat-481ing Observations of Southern Ocean Clouds and Implications for Climate.J. Adv.482Model. Earth Syst.. doi: 10.1029/2020JD032619483\n#\n# Gettelman, A., Hannay, C., Bacmeister, J. T., Neale, R. B., Pendergrass, A. G.,484Danabasoglu, G., . . . Mills, M. J.(2019).High Climate Sensitivity in the Com-485munity Earth System Model Version 2 (CESM2).Geophysical Research Letters,48646(14), 8329\u20138337. doi: 10.1029/2019GL083978487\n#\n# Gettelman, A., & Morrison, H. (2015). Advanced Two-Moment Bulk Microphysics488for Global Models. Part I: Off-Line Tests and Comparison with Other Schemes.J.489Climate,28(3), 1268\u20131287. doi: 10.1175/JCLI-D-14-00102.1490\n#\n# Gettelman, A., Morrison, H., Santos, S., Bogenschutz, P., & Caldwell, P. M. (2015).491Advanced Two-Moment Bulk Microphysics for Global Models. Part II: Global492Model Solutions and Aerosol\u2013Cloud Interactions.J. Climate,28(3), 1288\u20131307.493doi: 10.1175/JCLI-D-14-00103.1494\n#\n# Gettelman, A., & Sherwood, S. C. (2016, October). Processes Responsible for Cloud495Feedback.Curr Clim Change Rep, 1\u201311. doi: 10.1007/s40641-016-0052-8\n#\n# Golaz, J.-C., Larson, V. E., & Cotton, W. R.(2002).A PDF-Based Model for497Boundary Layer Clouds. Part II: Model Results.J. Atmos. Sci.,59, 3552\u20133571.\n#\n# Hoose, C., Kristj \u0301ansson, J. E., Chen, J.-P., & Hazra, A. (2010, March). A Classical-499Theory-Based Parameterization of Heterogeneous Ice Nucleation by Mineral Dust,500Soot, and Biological Particles in a Global Climate Model.J. Atmos. Sci.,67(8),5012483\u20132503. doi: 10.1175/2010JAS3425.1\n#\n# Iacono, M. J., Mlawer, E. J., Clough, S. A., & Morcrette, J.-J. (2000). Impact of an503improved longwave radiation model, RRTM, on the energy budget and thermody-504namic properties of the NCAR community climate model, CCM3.jgr,105(D11),50514,873\u201314,890.\n#\n# Khairoutdinov, M. F., & Kogan, Y. (2000). 
A new cloud physics parameterization in507a large-eddy simulation model of marine stratocumulus.Monthly Weather Review,508128, 229\u2013243.\n#\n# Larson, V. E., Golaz, J.-C., & Cotton, W. R.(2002, December).Small-Scale and510Mesoscale Variability in Cloudy Boundary Layers: Joint Probability Density Func-511tions.J. Atmos. Sci.,59(24), 3519\u20133539. doi: 10.1175/1520-0469(2002)059\u30083519:512SSAMVI\u30092.0.CO;2\n#\n# Liu, X., & Penner, J. E. (2005). Ice Nucleation Parameterization for Global Models.514Meteor. Z.,14(499-514).\n#\n# Michibata, T., & Takemura, T.(2015, September).Evaluation of autoconversion520schemes in a single model framework with satellite observations.J. Geophys. Res.521Atmos.,120(18), 2015JD023818. doi: 10.1002/2015JD023818\n#\n# Neale, R. B., Richter, J. H., & Jochum, M.(2008).The Impact of Convection on523ENSO: From a Delayed Oscillator to a Series of Events.J. Climate,21, 5904-+.doi: 10.1175/2008JCLI2244.1\n#\n# Pruppacher, H. R., & Klett, J. D. (1997).Microphysics of Clouds and Precipitation526(Second ed.). Kluwer Academic.\n#\n# Seifert, A., & Beheng, K. D. (2001). A double-moment parameterization for simulat-531ing autoconversion, accretion and selfcollection.Atmos. Res.,59-60, 265\u2013281.\n#\n# Shi, X., Liu, X., & Zhang, K. (2015, February). Effects of pre-existing ice crystals on536cirrus clouds and comparison between different ice nucleation parameterizations537with the Community Atmosphere Model (CAM5).Atmospheric Chemistry and538Physics,15(3), 1503\u20131520. doi: 10.5194/acp-15-1503-2015\n#\n# Twomey, S. (1977). The influence of pollution on the shortwave albedo of clouds.J.553Atmos. Sci.,34(7), 1149\u20131152.\n#\n# Wang, Y., Liu, X., Hoose, C., & Wang, B.(2014, October).Different contact555angle distributions for heterogeneous ice nucleation in the Community Atmo-556spheric Model version 5.Atmos. Chem. Phys.,14(19), 10411\u201310430.doi:55710.5194/acp-14-10411-2014\n#\n# Zhang, G. J., & McFarlane, N. A. (1995). Sensitivity of climate simulations to the559parameterization of cumulus convection in the Canadian Climate Center general560circulation model.Atmos. Ocean,33, 407\u2013446.\n\n# ## Hackathon Challenges\n#\n# ### Monday\n# * Load the data\n# * Create an exploratory visualization of the data\n# * Test two different transformation and scaling methods\n# * Test one dimensionality reduction method\n# * Train a linear model\n# * Train a decision tree ensemble method of your choice\n\n#%%\n\n# Monday's code starts here\n\n# Import transform methods to test\n\n\n# Read in files, modified from code in data.py to drop application of transform step\n\n#%%\n\n# New version of assemble that reads in data without applying scaling or transformation, so that they can be tested without\n# reloading every time\ndef assemble_data_files_no_transform(files, input_cols, output_cols, train=True, subsample=1,\n meta_cols=(\"lat\", \"lon\", \"lev\", \"depth\", \"row\", \"col\", \"pressure\", \"temperature\",\n \"time\", \"qrtend_MG2\", \"nrtend_MG2\", \"nctend_MG2\")):\n \"\"\"\n This function loads data from a list of files\n Args:\n files: List of files being loaded\n input_cols: List of input columns for training the neural networks\n output_cols: List of output columns\n train: Whether to fit the Scaler objects or\n subsample:\n meta_cols:\n Returns:\n \"\"\"\n all_input_data = []\n all_output_data = []\n all_meta_data = []\n for i, filename in enumerate(files):\n if i % 10 == 0:\n print(\n f\"Finished loading {i}/{len(files)} files... 
opening file {filename}\")\n data = open_data_file(filename)\n if subsample < 1:\n sample_index = int(np.round(data.shape[0] * subsample))\n sample_indices = np.sort(np.random.permutation(\n np.arange(data.shape[0]))[:sample_index])\n else:\n sample_indices = np.arange(data.shape[0])\n all_input_data.append(data.loc[sample_indices, input_cols])\n all_output_data.append(data.loc[sample_indices, output_cols])\n all_meta_data.append(data.loc[sample_indices, meta_cols])\n del data\n print(\"Combining data\")\n combined_input_data = pd.concat(all_input_data, ignore_index=True)\n combined_output_data = pd.concat(all_output_data, ignore_index=True)\n combined_meta_data = pd.concat(all_meta_data, ignore_index=True)\n print(\"Combined Data Size\", combined_input_data.shape)\n del all_input_data[:]\n del all_output_data[:]\n\n return combined_input_data, combined_output_data, combined_meta_data\n\n\n# Function to read in parquet files\n\n#%%\n\ndef open_data_file(filename):\n if \"ncar-aiml-data-commons/microphysics\" in filename:\n fs = s3fs.S3FileSystem(anon=True)\n fobj = fs.open(filename)\n ds = pd.read_parquet(fobj).set_index('Index')\n return ds\n else:\n ds = pd.read_csv(filename, index_col=\"Index\")\n return ds\n\n#%%\n\ninput_train, output_train, meta_train = assemble_data_files_no_transform(\n train_files, input_cols, output_cols)\n\n#%%\n\ninput_test, output_test, meta_test = assemble_data_files_no_transform(\n test_files, input_cols, output_cols)\n\n\n# Let's take a look at the input training data without any kind of transformation or scaling\n\n#%%\n\n# Histograms of training input data by column\nfig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_train[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n\n\n# Let's take a look at the input test data to make sure that our sample isn't pathological in some fashion\n\n#%%", "original_comment": "# Histograms of test input data by column\n", "target_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(input_test[input_cols[a]], bins=20)\n ax.set_title(input_cols[a])\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig, axes = plt.subplots(1, 5, figsize=(20, 3))\nfor a, ax in enumerate(axes.ravel()):\n if a < 
len(input_cols):\n ax.set_yscale(\"log\")\n ax.hist(output_train[output_cols[a]], bins=20)\n ax.set_title(output_cols[a])\n", "model": "docstring", "intent": "# Histograms of test input data by column"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Collaborative Filtering\n#\n# **By Li-Yen Hsu (11/10/2017)**\n#

\n# The goal of this project is to predict the ratings that would be given by each consumer for the restaurants he/she has not rated. A list of restaurants with the highest predicted ratings can then be recommended to each consumer. Because the restaurant ratings are numerical, predicting their values can be treated as a regression problem. Using classification techniques is also reasonable since the ratings are 0, 1 or 2 in this dataset. However, a multi-class classification will likely predict too many ties and therefore prevent us from generating a top-n list of recommendations for a consumer. Rather than predicting the exact values of ratings that a consumer would give to certain restaurants, what is more important for a recommender system is predicting the ranking of these restaurants for the consumer. Thus, I will attempt to predict continuous values in this notebook.\n#

\n# I will use matrix factorization-based algorithms for rating prediction. The fundamental concepts are that each item is characterized by a vector of features; each consumer preference is described by a vector of weights which has the same dimension as the item features; and the predicted rating of a item-consumer pair equals the inner product of the two vectors. The mathematics is therefore equivalent to a simple linear regression. For a content-based approach, the features are already determined based on the product information, leaving the weights to be optimized. For collaborative filtering, both the features and weights are the parameters to be optimized. I will use the latter in this notebook. Instead of using an existing library for recommender systems, I will implement the cost functions of the problem and perform optimization using \"minimize()\" from the SciPy package. But I will also run a [SVD model](http://sifter.org/simon/journal/20061211.html) using [Surprise](http://surpriselib.com/) at the end and compare its performance with mine.\n\n#%%\n\nfrom surprise.dataset import Reader, Dataset\nfrom surprise import SVD\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport random\nfrom sklearn.metrics import mean_squared_error\nfrom scipy.optimize import minimize\n\n# Since I will use collaborative filtering approaches, only the csv file\n# for the ratings is needed\nrating = pd.read_csv('data/rating_final.csv')\n\n\n# ## Data Preprocessing", "original_comment": "# Let's first look at the data frame.\n", "target_code": "rating.head()\n", "project_metadata": {"full_name": "liyenhsu/restaurant-data-with-consumer-ratings", "description": "Build recommender systems for restaurants", "topics": [], "git_url": "git://github.com/liyenhsu/restaurant-data-with-consumer-ratings.git", "stars": 3, "watchers": 3, "forks": 4, "created": "2017-11-09T05:11:58Z", "size": 1373, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1230183}, "last_updated": "2020-10-11T20:40:42Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "rating.head(5)\nrating.shape\n", "model": "natural", "intent": "# look at the data frame"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import 
Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## VGG19+Xception\n# #### Detailed running processes are avaibale at private Kaggle Kernel, an attached pdf screenshots show this notebook is runnable\n# #### Reference: https://www.kaggle.com/atrisaxena/keras-plant-seedlings-vgg19-augmentation\n\n#%%\n\nfrom keras.models import Sequential, Model\nfrom keras.layers import BatchNormalization\nfrom keras.layers.convolutional import MaxPooling2D\nfrom keras.layers.convolutional import Conv2D\nfrom keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\nfrom keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\nfrom keras import optimizers\nfrom keras import applications\nfrom keras.utils import np_utils\nimport numpy as np\nfrom mpl_toolkits.axes_grid1 import ImageGrid\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom keras.preprocessing.image import 
ImageDataGenerator\nimport cv2\nfrom IPython.core.interactiveshell import InteractiveShell\nimport os\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport numpy as np # linear algebra\nimport warnings\nwarnings.filterwarnings('ignore')\nprint(os.listdir(\"../input\"))\nInteractiveShell.ast_node_interactivity = \"all\"\n\n#%%\n\nCATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',\n 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']\nNUM_CATEGORIES = len(CATEGORIES)\n\n#%%\n\nSEED = 123\ndata_dir = '../input/'\ntrain_dir = os.path.join(data_dir, 'train')\ntest_dir = os.path.join(data_dir, 'test')\nsample_submission = pd.read_csv(\n os.path.join(data_dir, 'sample_submission.csv'))\n\n\n# ### Number of training images for each Category\n\n#%%\n\nfor category in CATEGORIES:\n print('{} {} images'.format(category, len(\n os.listdir(os.path.join(train_dir, category)))))\n\n#%%\n\ntrain = []\nfor category_id, category in enumerate(CATEGORIES):\n for file in os.listdir(os.path.join(train_dir, category)):\n train.append(\n ['train/{}/{}'.format(category, file), category_id, category])\ntrain = pd.DataFrame(train, columns=['file', 'category_id', 'category'])\ntrain.head(2)\ntrain.shape\n\n#%%\n\ntest = []\nfor file in os.listdir(test_dir):\n test.append(['test/{}'.format(file), file])\ntest = pd.DataFrame(test, columns=['filepath', 'file'])\ntest.head(2)\ntest.shape\n\n\n# ## Model Structure\n\n#%%\n\nscale = 299\n\nmodel = applications.VGG19(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\nadd_model = applications.Xception(\n weights=\"imagenet\", include_top=False, input_shape=(scale, scale, 3))\n\nmodel = Sequential()\nmodel.add(add_model)\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu',\n input_shape=(scale, scale, 3)))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(Conv2D(20, kernel_size=(3, 3), activation='relu'))\nmodel.add(BatchNormalization(axis=3))\nmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))\nmodel.add(Dropout(0.2))\n\n\nmodel.add(Flatten())\nmodel.add(Dense(256, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(64, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(12, activation='softmax'))", "original_comment": "# compling and show model\n", "target_code": "from keras import optimizers\n\nmodel.compile(loss=\"categorical_crossentropy\", optimizer=optimizers.SGD(\n lr=0.0001, momentum=0.9), metrics=[\"accuracy\"])\nmodel.summary()\n", "project_metadata": {"full_name": "WuZhuoran/Plant_Seedlings_Classification", "description": "Kaggle Competition Project as well as ANLY 590 Final Project. 
Task: Determine the species of a seedling from an image", "topics": [], "git_url": "git://github.com/WuZhuoran/Plant_Seedlings_Classification.git", "stars": 10, "watchers": 10, "forks": 7, "created": "2018-10-31T01:19:27Z", "size": 10167, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2140227, "Python": 31477}, "last_updated": "2020-12-18T16:42:52Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "precision": "Strongly agree", "precision-score": 3, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "model.compile(loss='categorical_crossentropy',\n optimizer='adam', metrics=['accuracy'])\n", "model": "docstring", "intent": "# compling and show model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n\n# Dependencies and packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n\ndf_reviews.head()\n\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Beer Analysis\n# ---\n#\n# Notebook to get insights from the dataset for beers, breweries and reviews.\n#\n# The dataset came in 3 different CSV files: beers.csv, breweries.csv and reviews.csv that has around 9 Million reviews!\n#\n# #### Data Source: [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv)\n#\n# The data comes from [BeerAdvocate](https://www.beeradvocate.com)\n\n#%%\n\n# Dependencies and 
packages\nimport plotly.io as pio\nimport plotly.graph_objects as go\nimport plotly.express as px\nimport plotly\nimport datetime as dt\nimport math as math\nimport numpy as np\nimport pandas as pd\nimport os\nget_ipython().run_line_magic('reload_ext', 'lab_black')\n\n#%%\n\ncsv_path = os.path.join(\"../data/csv/beers.csv\")\ndf_beers = pd.read_csv(csv_path)\n\ndf_beers.head()\n\n#%%\n\ndf_beers = df_beers.drop([\"notes\"], axis=1)\ndf_beers.head()\n\n#%%\n\ncsv_path2 = os.path.join(\"../data/csv/breweries.csv\")\ndf_breweries = pd.read_csv(csv_path2)\n\ndf_breweries.head()\n\n#%%\n\ndf_breweries = df_breweries.drop([\"notes\"], axis=1)\ndf_breweries.head()\n\n#%%\n\ncsv_path3 = os.path.join(\"../data/csv/reviews.csv\")\ndf_reviews = pd.read_csv(csv_path3)\n\n#%%\n\ndf_reviews.head()\n\n#%%\n\nprint(df_beers.shape)\nprint(df_breweries.shape)\nprint(df_reviews.shape)\n\n#%%\n\n# Find the number of unique beers in reviews.csv\ndf_reviews[\"beer_id\"].value_counts()\n\n#%%\n\n# Find unique users who reviewed\ndf_reviews[\"username\"].value_counts()\n\n#%%\n\n# Find unique value counts for every column in beers.csv\ndf_beers.apply(lambda x: x.isnull().value_counts())\n\n#%%\n\n# Find value counts for every column in breweries.csv\ndf_breweries.apply(lambda x: x.isnull().value_counts())\n\n#%%", "original_comment": "# Find value counts for every column in reviews.csv\n", "target_code": "df_reviews.apply(lambda x: x.isnull().value_counts())\n", "project_metadata": {"full_name": "sheetalbongale/ALE-gorithm", "description": "All things Beer! Beer Educator and Recommender Web App | Deployed on GCP > https://alegorithm-fxljyqhslq-uc.a.run.app/ | UT Data Analysis and Visualization Nov 2019 - May 2020. ", "topics": ["recommender", "gcp-cloud-build", "python-flask-application", "sqlalchemy", "plotlyjs", "anychart-javascript-library", "d3js", "mysql"], "git_url": "git://github.com/sheetalbongale/ALE-gorithm.git", "stars": 5, "watchers": 5, "forks": 5, "created": "2020-03-01T22:59:58Z", "size": 56307, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 204948, "JavaScript": 52038, "CSS": 48412, "HTML": 46213, "Python": 15403, "Dockerfile": 433}, "last_updated": "2020-05-07T08:39:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "precision": "Disagree", "precision-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "usefulness": "Agree", "usefulness-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df_reviews[\"username\"].value_counts()\n", "model": "docstring", "intent": "# Find value counts for every column in reviews.csv"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n\n# global params\n\nparser = 
argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith 
torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom __future__ import print_function\nimport argparse\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.autograd import Variable\n\nimport os\nimport numpy as np\nimport h5py\nimport time\n\nimport torch_utils\nimport data_utils\n\nimport librosa\nfrom sklearn.cluster import KMeans\n\n#%%\n\n# global params\n\nparser = argparse.ArgumentParser(description='DANet')\nparser.add_argument('--batch-size', type=int, default=128,\n help='input batch size for training (default: 128)')\nparser.add_argument('--epochs', type=int, default=100,\n help='number of epochs to train (default: 100)')\nparser.add_argument('--cuda', action='store_true', default=True,\n help='enables CUDA training (default: True)')\nparser.add_argument('--seed', type=int, default=20170220,\n help='random seed (default: 20170220)')\nparser.add_argument('--infeat-dim', type=int, default=129,\n help='dimension of the input feature (default: 129)')\nparser.add_argument('--outfeat-dim', type=int, default=20,\n help='dimension of the embedding (default: 20)')\nparser.add_argument('--threshold', type=float, default=0.9,\n help='the weight threshold (default: 0.9)')\nparser.add_argument('--seq-len', type=int, default=100,\n help='length of the sequence (default: 100)')\nparser.add_argument('--log-step', type=int, default=100,\n help='how many batches to wait before logging training status (default: 100)')\nparser.add_argument('--lr', type=float, default=1e-3,\n help='learning rate (default: 1e-3)')\nparser.add_argument('--num-layers', type=int, default=4,\n help='number of stacked RNN layers (default: 1)')\nparser.add_argument('--bidirectional', action='store_true', default=True,\n help='whether to use bidirectional RNN layers (default: True)')\nparser.add_argument('--val-save', type=str, default='model.pt',\n help='path to save the best model')\n\nargs, _ = parser.parse_known_args()\nargs.cuda = args.cuda and torch.cuda.is_available()\nargs.num_direction = int(args.bidirectional)+1\n\ntorch.manual_seed(args.seed)\nif args.cuda:\n torch.cuda.manual_seed(args.seed)\n kwargs = {'num_workers': 1, 'pin_memory': True}\nelse:\n kwargs = {}\n\n# STFT parameters\nsr = 8000\nnfft = 256\nnhop = 64\nnspk = 2\n\n#%%\n\n# define model\n\nclass DANet(nn.Module):\n def __init__(self):\n super(DANet, self).__init__()\n\n self.rnn = 
torch_utils.MultiRNN('LSTM', args.infeat_dim, 300,\n num_layers=args.num_layers,\n bidirectional=args.bidirectional)\n self.FC = torch_utils.FCLayer(\n 600, args.infeat_dim*args.outfeat_dim, nonlinearity='tanh')\n\n self.infeat_dim = args.infeat_dim\n self.outfeat_dim = args.outfeat_dim\n self.eps = 1e-8\n\n def forward(self, input, hidden):\n \"\"\"\n input: the input feature; \n shape: (B, T, F)\n\n hidden: the initial hidden state in the LSTM layers.\n \"\"\"\n\n seq_len = input.size(1)\n\n # generate the embeddings (V) by the LSTM layers\n LSTM_output, hidden = self.rnn(input, hidden)\n LSTM_output = LSTM_output.contiguous().view(-1, LSTM_output.size(2)) # B*T, H\n V = self.FC(LSTM_output) # B*T, F*K\n V = V.view(-1, seq_len*self.infeat_dim, self.outfeat_dim) # B, T*F, K\n\n return V\n\n def init_hidden(self, batch_size):\n return self.rnn.init_hidden(batch_size)\n\n#%%\n\n# load model\nmodel = DANet()\nmodel.load_state_dict(torch.load('model.pt'))\n\nif args.cuda:\n model.cuda()\nmodel.eval()\n\n#%%\n\n# load mixture data\nmix, _ = librosa.load('your_path_to_mixture_audio', sr=sr)\n\n# STFT\nmix_spec = librosa.stft(mix, nfft, nhop) # F, T\nmix_phase = np.angle(mix_spec) # F, T\nmix_spec = np.abs(mix_spec) # F, T\n\n# magnitude spectrogram in db scale\ninfeat = 20*np.log10(mix_spec.T)\ninfeat = np.asarray([infeat]*1)\n# optional: normalize the input feature with your pre-calculated\n# statistics of the training set\n\nbatch_infeat = Variable(torch.from_numpy(infeat)).contiguous()\nif args.cuda:\n batch_infeat = batch_infeat.cuda()\n\nwith torch.no_grad():\n hidden = model.init_hidden(batch_infeat.size(0))\n embeddings = model(batch_infeat, hidden)\n\n# estimate attractors via K-means\nembeddings = embeddings[0].data.cpu().numpy() # T*F, K\nkmeans_model = KMeans(n_clusters=nspk, random_state=0).fit(\n embeddings.astype('float64'))\nattractor = kmeans_model.cluster_centers_ # nspk, K\n\n# estimate masks\nembeddings = torch.from_numpy(embeddings).float() # T*F, K\nattractor = torch.from_numpy(attractor.T).float() # K, nspk\nif args.cuda:\n embeddings = embeddings.cuda()\n attractor = attractor.cuda()\n\nmask = F.softmax(torch.mm(embeddings, attractor), dim=1) # T*F, nspk\nmask = mask.data.cpu().numpy()\n\nmask_1 = mask[:, 0].reshape(-1, args.infeat_dim).T\nmask_2 = mask[:, 1].reshape(-1, args.infeat_dim).T\n\n# masking the mixture magnitude spectrogram\ns1_spec = (mix_spec * mask_1) * np.exp(1j*mix_phase)\ns2_spec = (mix_spec * mask_2) * np.exp(1j*mix_phase)\n\n# reconstruct waveforms\nres_1 = librosa.istft(s1_spec, hop_length=nhop, win_length=nfft)\nres_2 = librosa.istft(s2_spec, hop_length=nhop, win_length=nfft)\n\nif len(res_1) < len(mix):", "original_comment": " # pad zero at the end\n", "target_code": " res_1 = np.concatenate([res_1, np.zeros(len(mix)-len(res_1))])\n", "project_metadata": {"full_name": "naplab/DANet", "description": "Deep Attractor Network (DANet) for single-channel speech separation", "topics": [], "git_url": "git://github.com/naplab/DANet.git", "stars": 53, "watchers": 53, "forks": 15, "created": "2018-09-18T21:26:22Z", "size": 11, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23340, "Python": 4814}, "last_updated": "2020-12-14T07:40:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, 
{"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Agree", "compatibility-score": 2, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "if args.cuda:\n res_1 = res_1.cuda()\nif len(res_2) < len(mix):\n res_2 = res_2.cuda()\nif len(mask) < len(mix):\n mask = mask.cuda()\nif len(mask_1) < len(mix):\n mask_1 = mask_1.cuda()\nif len(mask_2) < len(mix):\n mask_2 = mask_2.cuda()\n", "model": "natural", "intent": " # pad zero at the end"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\nsavelist # list of lists\n\n\n# we can check the items of the list we created\nsavelist[1]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Welcome to Session 4\n\n# ## Session Outline\n#\n# - 
loading data in different ways\n# - manipulating data structures\n# - removing stopwords\n# - Building a cleaning Pipeline\n\n# # Loading Data\n\n# ### What is the difference between\n# - read()\n# - readlines()\n# - readline()\n\n#%%\n\nfrom collections import Counter\nimport string\nimport re # the re\nimport codecs\nfrom nltk.corpus import stopwords\nimport re\nimport os\nimport glob\nimport csv\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile = file1.read()\nfile\n\n#%%\n\nfile1 = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt', 'r')\nfile1.readline()\n\n#%%\n\nfile1.readlines()\n\n\n# ## Context manager\n#\n# use this instead open and close\n#\n#\n\n#%%\n\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/ihaveadream.txt\"\n\nwith open(filepath, \"r\") as infile:\n content = infile.read()\n\nprint(content)\n\n\n# ### Lets try opening another file\n\n#%%\n\nsputnik = open('/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv', \"r\") # open files\n# a lst of strings, each string refers to one line in the file\nlines_sputnik = sputnik.readlines()\nfor i in range(5): # read line by line\n print(lines_sputnik[i])\nsputnik.close()\n\n\n# ## Other ways to load data: CSV library\n#\n#\n# Good for csv and tsv files\n#\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nread_tsv\n\n#%%\n\nfor row in read_tsv:\n print(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n\n# what if I want to save this in a list to be able to access later.\n#\n# Here loops become handy\n\n#%%\n\ntsv_file = open(\"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\")\nread_tsv = csv.reader(tsv_file, delimiter=\"\\t\")\nsavelist = []\nfor row in read_tsv:\n savelist.append(row)\ntsv_file.close() # we see here each row is now saved in a list\n\n#%%\n\nsavelist # list of lists\n\n#%%\n\n# we can check the items of the list we created\nsavelist[1]\n\n#%%", "original_comment": "# access items from a list\n", "target_code": "savelist[0][1]\n", "project_metadata": {"full_name": "aelshehawy/Computational-Text-Analysis-for-Political-Science", "description": null, "topics": [], "git_url": "git://github.com/aelshehawy/Computational-Text-Analysis-for-Political-Science.git", "stars": 9, "watchers": 9, "forks": 10, "created": "2020-05-02T16:01:07Z", "size": 55280, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 75215571}, "last_updated": "2020-06-28T18:31:38Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", 
"precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "filepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sputnikgerman20.tsv\"\nwith open(filepath, \"r\") as infile:\n reader = csv.reader(infile, delimiter=\"\\t\")\n for row in reader:\n print(row)\nfilepath = \"/Users/Ashrakat/Dropbox/University/Oxford/Jobs/Teaching/Text Analysis/code/Basics-of-Text-Analysis-for-Political-Science/Data/sput\n", "model": "natural", "intent": "# access items from a list"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. Pre-process\n\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n#%%\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n#%%\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. 
Pre-process\n\n#%%\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)", "original_comment": "# print out all data shapes\n", "target_code": "print('Training set', train_dataset.shape, train_labels.shape)\nprint('Validation set', validation_dataset.shape, validation_labels.shape)\nprint('Test set', test_dataset.shape, test_labels.shape)\n", "project_metadata": {"full_name": "cwru-robotics/cwru_dnn", "description": "deep neural net explorations", "topics": [], "git_url": "git://github.com/cwru-robotics/cwru_dnn.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-07-25T14:47:31Z", "size": 49625, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 329694, "Python": 19000, "C++": 17781, "CMake": 7310}, "last_updated": "2020-03-13T14:59:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print('Training set', train_dataset.shape, train_labels.shape)\nprint('Validation set', validation_dataset.shape, validation_labels.shape)\nprint('Test set', test_dataset.shape, test_labels.shape)\n", "model": "natural", "intent": "# print out all data shapes"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport 
warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", 
fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\ndef frequencyDists(wine_set):\n print(\"This is the frequency distribution of the wines' quality.\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n#%%\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n#%%\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n#%%\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = 
[low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n#%%\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n#%%\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n#%%\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n#%%\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\n#%%", "original_comment": "# print frequency distributions of wines' quality\n", "target_code": " print(wine_set.groupby(\"quality\").size()*100 / len(wine_set))\n", "project_metadata": {"full_name": "shrikant-temburwar/Wine-Quality-Dataset", "description": null, "topics": [], "git_url": "git://github.com/shrikant-temburwar/Wine-Quality-Dataset.git", "stars": 7, "watchers": 7, "forks": 13, "created": "2018-06-11T14:03:02Z", 
"size": 575, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 670078}, "last_updated": "2020-12-16T12:41:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "wine_set = pd.read_csv('winequality-red.csv', sep=';')\nwine_set.head()\n", "model": "no-comments", "intent": " # print frequency distributions of wines' quality"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n\n# Check shape.\ncaravan_df_raw.shape\n\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our 
binary outcome variable, and represents whether or not the customer bought Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Predicting Caravan Insurance Purchases #\n\n# This project explores a dataset compiled by *Caravan Insurance* about whether customers purchased their mobile home insurance. Using various classification models, we will attempt to predict whether a person will purchase Caravan insurance (as opposed to another company's comparable insurance) based on demographic information.\n#\n# NOTE: After looking over my results, I suspect there may be some data leakage going on. The next step for this project is to re-run the analysis ensuring no leakage occurs.\n\n# ## Contents ##\n# * **Part 1: Data Exploration**\n# * Initial Data Exploration\n# * **Part 2: Test, Train, Split**\n# * Ratio-Preserved Data\n# * **Part 3: Classify**\n# * Logistic Regression, K-NN, LDA, QDA, and SVC with:\n# * Un-Normalized, Ratio-Preserved\n# * Normalized Data\n# * Under Sampling\n# * Over Sampling\n# * SMOTE\n# * PCA + SMOTE\n# * **Part 4: Features**\n# * Removing Signal-Less Features\n# * Fewer Features + Over Sampling\n\n# ## Part 1: Data Exploration ##\n\n#%%\n\nfrom sklearn.decomposition import PCA\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn.svm import SVC\nfrom sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\nfrom sklearn.cross_validation import train_test_split\nfrom sklearn.metrics import roc_curve, auc, classification_report\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.preprocessing import StandardScaler\nimport math\nimport warnings\nimport missingno as msno\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\nwarnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))\nnp.random.seed(9)\n\n\n# ### Initial Data Exploration ###\n\n#%%\n\n# Read in the data.\ncaravan_df_raw = pd.read_csv('caravan.csv')\n\n#%%\n\n# Check shape.\ncaravan_df_raw.shape\n\n#%%\n\n# Sample of the data.\ncaravan_df_raw.head(3)\n\n#%%\n\n# List out all columns.\ncaravan_df_raw.columns\n\n\n# The features are not explicit and so we will need to forge ahead without a clear understanding of the collection of predictors we are working with.\n#\n# **`Purchase`** is our binary outcome variable, and represents whether or not the customer bought 
Caravan insurance:\n#\n# Yes = Purchased\n# No = Did Not Purchase\n#\n# This is what we ultimately hope to predict.\n\n#%%\n\n# Visualize missing data.\nmsno.matrix(caravan_df_raw)\n\n#%%\n\n# Ensure there are no missing values.\nprint('Missing values: %i' % caravan_df_raw.isnull().sum().sum())\n\n#%%\n\n# Find out which columns are numeric.\nnumeric_df = caravan_df_raw.select_dtypes(include=[np.number])\nnumeric_df.shape\n\n\n# `85` out of `86` columns are numeric, and we know that `Purchase` is not, so `Purchase` must be the only non-numeric column.\n\n#%%\n\n# Purchase has string values.\ncaravan_df_raw['Purchase'].value_counts()\n\n#%%\n\n# Change string values to binary.\ncaravan_df_raw['Purchase'] = caravan_df_raw['Purchase'].factorize()[0]\ncaravan_df_raw['Purchase'].value_counts()\n\n\n# We'll now do some exploratory data analysis on the features, but we'll keep the outcome variable in the dataset.\n\n#%%", "original_comment": "# Look globally at correlation of features.\n", "target_code": "corr = caravan_df_raw.corr()\n", "project_metadata": {"full_name": "jonrossi/caravan-insurance", "description": "Exploration and analysis of the Caravan Insurance dataset", "topics": [], "git_url": "git://github.com/jonrossi/caravan-insurance.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-09-23T17:40:57Z", "size": 951, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1260942}, "last_updated": "2020-10-31T21:58:03Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "corr = caravan_df_raw.corr()\nsns.heatmap(corr)\n", "model": "docstring", "intent": "# Look globally at correlation of features."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n\nprint(spam.head())\n\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n\nprint(data_set[:5])\n\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n 
words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n\nprint(training_set[:5])\n\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### SPAM Ham Detection\n\n#%%\n\nimport pickle\nimport random\nimport nltk\nimport pandas as pd\nfrom nltk.tokenize import word_tokenize\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer\nfrom nltk.stem import WordNetLemmatizer\n\n#%%\n\n# Reading the given dataset\nspam = pd.read_csv(\"SMSSpamCollection.txt\", sep=\"\\t\",\n names=[\"label\", \"message\"])\n\n#%%\n\nprint(spam.head())\n\n#%%\n\n# Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label\ndata_set = []\nfor index, row in spam.iterrows():\n data_set.append((row['message'], row['label']))\n\n#%%\n\nprint(data_set[:5])\n\n#%%\n\nprint(len(data_set))\n\n\n# ### Preprocessing\n\n#%%\n\n# initialise the inbuilt Stemmer and the Lemmatizer\nstemmer = PorterStemmer()\nwordnet_lemmatizer = WordNetLemmatizer()\n\n#%%\n\ndef preprocess(document, stem=True):\n 'changes document to lower case, removes 
stopwords and lemmatizes/stems the remainder of the sentence'\n\n # change sentence to lower case\n document = document.lower()\n\n # tokenize into words\n words = word_tokenize(document)\n\n # remove stop words\n words = [word for word in words if word not in stopwords.words(\"english\")]\n\n if stem:\n words = [stemmer.stem(word) for word in words]\n else:\n words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]\n\n # join words to make sentence\n document = \" \".join(words)\n\n return document\n\n#%%\n\n# - Performing the preprocessing steps on all messages\nmessages_set = []\nfor (message, label) in data_set:\n words_filtered = [e.lower() for e in preprocess(\n message, stem=False).split() if len(e) >= 3]\n messages_set.append((words_filtered, label))\n\n#%%\n\nprint(messages_set[:5])\n\n\n# ### Preparing to create features\n\n#%%\n\n# - creating a single list of all words in the entire dataset for feature list creation\n\ndef get_words_in_messages(messages):\n all_words = []\n for (message, label) in messages:\n all_words.extend(message)\n return all_words\n\n#%%\n\n# - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words\n# Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.\n\ndef get_word_features(wordlist):\n\n # print(wordlist[:10])\n wordlist = nltk.FreqDist(wordlist)\n word_features = wordlist.keys()\n return word_features\n\n#%%\n\n# - creating the word features for the entire dataset\nword_features = get_word_features(get_words_in_messages(messages_set))\nprint(len(word_features))\n\n\n# ### Preparing to create a train and test set\n\n#%%\n\n# - creating slicing index at 80% threshold\nsliceIndex = int((len(messages_set)*.8))\n\n#%%\n\n# - shuffle the pack to create a random and unbiased split of the dataset\nrandom.shuffle(messages_set)\n\n#%%\n\ntrain_messages, test_messages = messages_set[:\n sliceIndex], messages_set[sliceIndex:]\n\n#%%\n\nlen(train_messages)\nlen(test_messages)\n\n\n# ### Preparing to create feature maps for train and test data\n\n#%%\n\n# creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages\ndef extract_features(document):\n document_words = set(document)\n features = {}\n for word in word_features:\n features['contains(%s)' % word] = (word in document_words)\n return features\n\n#%%\n\n# - creating the feature map of train and test data\n\ntraining_set = nltk.classify.apply_features(extract_features, train_messages)\ntesting_set = nltk.classify.apply_features(extract_features, test_messages)\n\n#%%\n\nprint(training_set[:5])\n\n#%%\n\nprint('Training set size : ', len(training_set))\nprint('Test set size : ', len(testing_set))\n\n\n# ### Training\n\n#%%", "original_comment": "# Training the classifier with NaiveBayes algorithm\n", "target_code": "spamClassifier = nltk.NaiveBayesClassifier.train(training_set)\n", "project_metadata": {"full_name": "beingdatum/NaturalLanguageProcessing", "description": null, "topics": [], "git_url": "git://github.com/beingdatum/NaturalLanguageProcessing.git", "stars": 3, "watchers": 3, "forks": 10, "created": "2020-01-01T13:54:22Z", "size": 23376, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2267856, "Python": 1378}, "last_updated": "2020-06-08T09:54:47Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, 
"precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", "model": "no-comments", "intent": "# Training the classifier with NaiveBayes algorithm"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n\ndf_1 = pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n\n# Checking for data types\n# fraud_df.dtypes\n\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n\n#fraud_df = fraud_df.head(10000)\n\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. 
For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n\nfraud_df.head()\n\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### Encoding\n\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n\nfraud_df[cat_col_onehotencode].head()\n\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: 
{}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n\nfraud_df.head()\n\n\n# fraud_df.columns\n\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n\n# change data sets into maxtrix objects for the 
models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Assignment Week 2 - Group 5\n#\n# ### Noelani Roy, Yihong Qiu, Cosimo Cambi, Craig Perkins\n\n# # Data Preparation\n\n# ## Data Selection\n\n# ### Import libraries\n\n#%%\n\n# main libraries\nimport pandas as pd\nimport numpy as np\nfrom datetime import date\nimport geopy.distance\nfrom math import sin, cos, sqrt, atan2, radians, log\nimport imblearn\nfrom numpy import mean, where\nfrom collections import Counter\nimport qgrid\n\n# visual libraries\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\n# sklearn libraries\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.decomposition import PCA\nfrom sklearn.datasets import make_classification\nfrom sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold\nfrom sklearn.tree import DecisionTreeClassifier\nfrom imblearn.pipeline import Pipeline\nfrom imblearn.over_sampling import SMOTE\nfrom imblearn.under_sampling import RandomUnderSampler\n\n\n# ### Read data\n\n#%%\n\ndf_1 = 
pd.read_csv(\"../fraudTrain.csv\")\ndf_2 = pd.read_csv(\"../fraudTest.csv\")\nfraud_df = df_1.append(df_2)\n\n#%%\n\nfraud_df.shape\n# Craig I noticed that I started out with 23 columns and you started out with 28\n# Did you do something to you files before loading them?\n# What does everyone else have?\n\n\n# ## Data Preprocessing\n\n#%%\n\n# Checking for missing values\nfraud_df.isnull().any().sum()\n\n#%%\n\n# Checking for data types\n# fraud_df.dtypes\n\n#%%\n\n# Checking for unique values\n# fraud_df.nunique()\n\n\n# ### Formatting and cleansing\n\n#%%\n\n#fraud_df = fraud_df.head(10000)\n\n#%%\n\ndef calculate_age(born):\n today = date.today()\n return today.year - born.year - ((today.month, today.day) < (born.month, born.day))\n\n\ndef calculate_distance(row):\n coords_1 = (row['lat'], row['long'])\n coords_2 = (row['merch_lat'], row['merch_long'])\n return geopy.distance.geodesic(coords_1, coords_2).km\n\n# Answer from https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n# The answers above are based on the Haversine formula, which assumes the earth is a sphere,\n# which results in errors of up to about 0.5% (according to help(geopy.distance)).\n# Vincenty distance uses more accurate ellipsoidal models such as WGS-84, and is implemented in geopy. For example,\n\n\ndef calculate_distance2(row):\n # approximate radius of earth in km\n R = 6373.0\n\n lat1 = radians(row['lat'])\n lon1 = radians(row['long'])\n lat2 = radians(row['merch_lat'])\n lon2 = radians(row['merch_long'])\n\n dlon = lon2 - lon1\n dlat = lat2 - lat1\n\n a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2\n c = 2 * atan2(sqrt(a), sqrt(1 - a))\n\n distance = R * c\n print(distance)\n\n\n# First derive columns\nif 'trans_date_trans_time' in fraud_df.columns:\n fraud_df['txn_datetime'] = pd.to_datetime(\n fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')\n\nif 'dob' in fraud_df.columns:\n fraud_df['age'] = [calculate_age(d) for d in pd.to_datetime(\n fraud_df['dob'], format='%Y-%m-%d')]\n\nif set(['lat', 'long', 'merch_lat', 'merch_long']).issubset(set(fraud_df.columns)):\n fraud_df['distance'] = [calculate_distance(\n row) for _, row in fraud_df.iterrows()]\n\nfraud_df['hour'] = fraud_df['txn_datetime'].dt.hour\nfraud_df['day'] = fraud_df['txn_datetime'].dt.day\nfraud_df['month'] = fraud_df['txn_datetime'].dt.month\nfraud_df['year'] = fraud_df['txn_datetime'].dt.year\n\nfraud_df['log_amt'] = [log(n) for n in fraud_df['amt']]\n\n#%%\n\n# Saving the new data frame since it take a while to create the new ones before after do the above steps and before the columns are dropped.\nfraud_df.to_csv('grp5_fraud_mod.csv')\n\n#%%\n\n# Read Modified Fraud File here to skip the feature generation...which can take a while\nfraud_df = pd.read_csv(\"grp5_fraud_mod.csv\")\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# Drop the columns used to derive new features\nfraud_df.drop([\n 'Unnamed: 0',\n 'Unnamed: 0.1',\n 'trans_num',\n 'trans_date_trans_time',\n 'cc_num',\n 'merchant',\n 'unix_time',\n 'first',\n 'last',\n 'street',\n 'city',\n 'state',\n 'zip',\n 'job',\n 'dob',\n 'lat',\n 'long',\n 'merch_lat',\n 'merch_long'\n], axis=1, errors='ignore', inplace=True)\n\n\ncols = list(fraud_df.columns)\ncols.insert(0, cols.pop(cols.index(\"txn_datetime\")))\ncols.append(cols.pop(cols.index('is_fraud')))\nfraud_df = fraud_df[cols]\n\n#%%\n\nqgrid.show_grid(fraud_df.head(100), grid_options={\n 'forceFitColumns': False, 'defaultColumnWidth': 100})\n\n\n# ### 
Encoding\n\n#%%\n\n# create a lis to hold our categorical columns and one to hold our numerical columns\ncat_col = ['category', 'gender', ]\n# this is for the linear regression set and does not include amount - if we end up wanting to predict something else\n# we would just need replace the log_amt column with something else\nnum_col = ['age', 'distance', 'year', 'month', 'day', 'hour', 'city_pop']\n\n# this is for the logistic regression model where we are predicitng is_fraud\n#num_col = ['log_amt','age','distance','year','month','day','hour','city_pop']\n\n# can chane if we want something else #for linear regression model\nfraud_df['Target'] = fraud_df['log_amt']\n# fraud_df['Target'] = fraud_df['is_fraud'] #for logistic regression model\n\n#%%\n\n# This one hot encodes the categorical columns and create a new variable to hold the nex column headers\nif len(cat_col) > 0:\n cat_onehotencode = pd.get_dummies(fraud_df[cat_col], drop_first=True)\n cat_col_onehotencode = list(cat_onehotencode.columns)\n fraud_df = pd.concat([fraud_df, cat_onehotencode], axis=1)\n\nelse:\n cat_col_onehotencode = []\n\n#%%\n\nfraud_df[cat_col_onehotencode].head()\n\n#%%\n\nfraud_df = fraud_df.drop(['category', 'gender'], axis=1)\n\n#%%\n\n# Count all the columns and put them togeather into one variable for easy tracking\n# does not include the log_amt column.\nprint('Total number of features: {}'.format(\n len(num_col + cat_col_onehotencode)))\nprint('Numerical Features: {}'.format(len(num_col)))\nprint('Categorical Features: {}'.format(len(cat_col_onehotencode)))\ninput_col = (num_col + cat_col_onehotencode)\n\n\n# ## Data Transformation\n\n# ### Standardize the data\n\n#%%\n\nfraud_df.head()\n\n#%%\n\n# fraud_df.columns\n\n#%%\n\nfeatures = ['amt', 'city_pop', 'age', 'distance', 'hour', 'day',\n 'month', 'year', 'log_amt', 'Target', 'category_food_dining',\n 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos',\n 'category_health_fitness', 'category_home', 'category_kids_pets',\n 'category_misc_net', 'category_misc_pos', 'category_personal_care',\n 'category_shopping_net', 'category_shopping_pos', 'category_travel',\n 'gender_M']\ntarget = ['is_fraud']\n# Separating out the features\nx = fraud_df.loc[:, features].values\n# Separating out the target\ny = fraud_df.loc[:, target].values\n# Standardizing the features\nx = StandardScaler().fit_transform(x)\n\n\n# ### Principal Component Analysis\n#\n# Dimensionality reduction is used in machine learning: to combat computational cost, to control overfitting, and to visualize and help interpret high dimensional data sets.\n#\n# PCA is a statistical method that creates new features or characteristics of data by analyzing the characteristics of the dataset. Essentially, the characteristics of the data are summarized or combined together. You can also conceive of Principal Component Analysis as \"squishing\" data down into just a few dimensions from much higher dimensions space.\n\n# First, we get the list of features and plot which features have the most explanatory power, or have the most variance. It looks like around 22 or 23 of the features explain the majority of our data.\n\n#%%\n\npca = PCA()\npca.fit_transform(x)\npca_variance = pca.explained_variance_\n\nplt.figure(figsize=(8, 6))\nplt.bar(range(24), pca_variance, alpha=0.5,\n align='center', label='individual variance')\nplt.legend()\nplt.ylabel('Variance ratio')\nplt.xlabel('Principal components')\nplt.show()\n\n\n# Then, Let's convert the features into the 2 top features. 
We'll plot a scatter plot of the data point classification based on these 2 features.\n\n#%%\n\npca = PCA(n_components=2)\nprincipalComponents = pca.fit_transform(x)\nprincipal_df = pd.DataFrame(data=principalComponents,\n columns=['principal component 1', 'principal component 2'])\nprincipal_df\n\n#%%\n\nfraud = pd.DataFrame(data=y, columns=['is_fraud'])\nfinal_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)\nfinal_df\n\n#%%\n\nfig = plt.figure(figsize=(8, 8))\nax = fig.add_subplot(1, 1, 1)\nax.set_xlabel('Principal Component 1', fontsize=15)\nax.set_ylabel('Principal Component 2', fontsize=15)\nax.set_title('2 component PCA', fontsize=20)\ntargets = [1, 0]\ncolors = ['r', 'b']\nfor target, color in zip(targets, colors):\n indicesToKeep = final_df['is_fraud'] == target\n ax.scatter(final_df.loc[indicesToKeep, 'principal component 1'],\n final_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50)\nax.legend(targets)\nax.grid()\n\n\n# ### Data splitting\n\n#%%\n\n# build Training/Validation/Test Samples\ndef Train_Valid_Test_Split(df, seed, tr):\n # First Shuffle the data\n df = df.sample(n=len(df), random_state=seed)\n df = df.reset_index(drop=True)\n\n # Splits Training Data = tr% - Validation & Test = (1-tr)/2\n # First take (1-tr) for Validation and Test\n valid_test = df.sample(frac=(1-tr), random_state=seed)\n\n # Then Split the validation and test data in half\n valid = valid_test.sample(frac=0.50, random_state=seed)\n test = valid_test.drop(valid.index)\n\n # The remodeling data is use for training data\n train = df.drop(valid_test.index)\n\n return train, valid, test\n\n\n# I used a 60% split for trian, 20% for vaild and 20% for Test because that is what the proff recommended.\ntrain, valid, test = Train_Valid_Test_Split(\n fraud_df[input_col + ['Target']], 12, .7) # (dataframe,randomseed,train split)\n\n#%%\n\n# check to make sure all samples are accounted for and distribution of continous variable matches across data sets\nplt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')\nplt.hist(train['Target'], alpha=0.5, label='Train', density=True)\nplt.hist(valid['Target'], alpha=0.5, label='Valid', density=True)\nplt.hist(test['Target'], alpha=0.5, label='Test', density=True)\nplt.legend(loc='upper right')\nplt.title = ('log(amt) Distribution in Split Datasets')\nplt.xlabel('log(amt)')\nplt.ylabel('Fraud Frequency')\nplt.show()\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\n\n#%%\n\ndef calc_prevalence(y_actual):\n # this function calculates the prevalence of the positive class (label = 1)\n return (sum(y_actual)/len(y_actual))\n\n\n# check to make sure all samples are accounted for and prevelence of classification target variable matches across data sets\nprint('Target checks:')\nprint('Test prevalence(n = {:0,d}):{:.3f}'.format(\n len(test), calc_prevalence(test.Target.values)))\nprint('Valid prevalence(n = {:0,d}):{:.3f}'.format(\n len(valid), calc_prevalence(valid.Target.values)))\nprint('Train all prevalence(n = {:0,d}):{:.3f}'.format(\n len(train), calc_prevalence(train.Target.values)))\n\n# check all records are accounted for\nprint('All samples (n = {:0,d}) accounted for.'.format(len(fraud_df)))\nassert len(fraud_df) == (len(test)+len(valid)+len(train)), 'math didnt work'\nprint('For continous variables the prevelence is actually the average of the target variable')\n\n#%%\n\n# change data 
sets into maxtrix objects for the models\nX_train = train[input_col].values\nX_valid = valid[input_col].values\nX_test = test[input_col].values\n\ny_train = train['Target'].values\ny_valid = valid['Target'].values\ny_test = test['Target'].values\n\n#%%\n\nprint(X_train.shape)\nprint(X_valid.shape)\nprint(X_test.shape)\nprint(y_train.shape)\nprint(y_valid.shape)\nprint(y_test.shape)\n\n\n# ### Resampling (SMOTE)\n\n# Resampling methods are designed to add or remove examples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.\n#\n# Here, we use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1 and use undersampling to delete or merge examples in the majority class which is when is_default = 0.\n#\n# SMOTE should only be done in the training data, but test on the original testing data set since the latter reflects the real-world distribution of majority and minority class samples. That's why we apply SMOTE after data splitting.\n\n# First, create a synthetic binary classification dataset with 12,000 examples and a 1:100 class distribution. Then we summarize the number of examples in each class to confirm the dataset was created correctly by using Counter(). Next, we oversample the minority class to have 20 percent the number of examples of the majority class, then use random undersampling to reduce the number of examples in the majority class to have 50 percent more than the minority class. Finally, we can create a scatter plot of the dataset and color the examples for each class a different color to clearly see the spatial nature of the class imbalance.\n\n#%%\n\n# Oversample with SMOTE and random undersample for imbalanced dataset\n# Define dataset\nX_train, y_train = make_classification(n_samples=12000, n_features=2, n_redundant=0, n_clusters_per_class=1,\n weights=[0.99], flip_y=0, random_state=1)\n\n# Summarize class distribution\ncounter = Counter(y_train)\nprint(counter)\n\n# Define pipeline\nover = SMOTE(sampling_strategy=0.2)\nunder = RandomUnderSampler(sampling_strategy=0.5)\nsteps = [('o', over), ('u', under)]\npipeline = Pipeline(steps=steps)\n\n# Transform the dataset\nX_train, y_train = pipeline.fit_resample(X_train, y_train)", "original_comment": "# Summarize the new class distribution\n", "target_code": "counter = Counter(y_train)\n", "project_metadata": {"full_name": "cwperks/eai6000_group5", "description": null, "topics": [], "git_url": "git://github.com/cwperks/eai6000_group5.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-10-29T00:18:52Z", "size": 42099, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 31550356, "HTML": 1870212, "Python": 15882}, "last_updated": "2020-12-07T04:23:48Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, 
"coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "print(X_train.shape)\nprint(y_train.shape)\n", "model": "no-comments", "intent": "# Summarize the new class distribution"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. If you are sure of it go-forward & run the cells below.\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's mount our G-Drive.\n\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n#%%\n\n# # Create the Workspace folder\nget_ipython().system(\"mkdir -p '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# # Clone the repository\nget_ipython().system(\"git clone 'https://github.com/amitbcp/icdmai_2020.git' '/content/drive/My Drive/ICDMAI_Tutorial/notebook/'\")\n\n# Download the Data\nget_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1XTkF0yANUPIZ3SXw5BbbWKCkWo7ZK56T'\")\n\n# Unzip the Data\nget_ipython().system(\"unzip 'ICDMAI_Tutorial.zip' -d '/content/drive/My Drive/'\")\n\n# Annnd You are ready to Go !\n\n\n# ## Other Artificats\n#\n# To run End-to-End pipeline you would require more data & space in G-Drive. 
If you are sure of it go-forward & run the cells below.\n\n#%%", "original_comment": "# Download the raw-data files\n", "target_code": "get_ipython().system(\n \"gdown 'https://drive.google.com/uc?id=1gkgUlkaRXUzrNR_jY42ieK4xtLX3ztKX'\")\nget_ipython().system(\n \"unzip 'raw_data_files.zip' -d '/content/drive/My Drive/ICDMAI_Tutorial/'\")\n", "project_metadata": {"full_name": "amitbcp/icdmai_2020", "description": "This repository is for the Session held in International Conference on Data Management, Analytics and Innovation, New Delhi 2020", "topics": ["deeplearning", "recurrent-neural-networks", "rnn-pytorch", "word-embeddings", "text-classification", "rnns", "notebooks", "stackoverflow", "tag-recommender", "recommendation-system", "svm", "onevsrest"], "git_url": "git://github.com/amitbcp/icdmai_2020.git", "stars": 7, "watchers": 7, "forks": 2, "created": "2020-01-04T04:42:01Z", "size": 13078, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2676004}, "last_updated": "2021-01-06T14:44:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "get_ipython().system(\n \"wget 'https://raw.githubusercontent.com/amitbcp/ICDMAI_Tutorial/master/data/ICDMAI_Tutorial.zip'\")\n", "model": "natural", "intent": "# Download the raw-data files"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata.isnull().sum()\n\n\ndata.dropna(inplace=True)\n\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n\npca = PCA(n_components=2)\npca_comps = 
pca.fit_transform(data_scaled)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\nimport plotly.express as px\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n#%%\n\ndata.tail()\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata.isnull().sum()\n\n#%%\n\ndata.dropna(inplace=True)\n\n#%%\n\ndata_num = data[['inning', 'balls', 'strikes', 'outs', 'probCalledStrike', 'releaseVelocity', 'spinRate', 'spinDir', 'locationVert', 'movementHoriz', 'movementVert',\n 'battedBallAngle', 'battedBallDistance']]\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\ndata_scaled = pd.DataFrame(data=scale.fit_transform(\n data_num), columns=data_num.columns)\ndata_scaled.tail()\n\n#%%\n\npca = PCA(n_components=2)\npca_comps = pca.fit_transform(data_scaled)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.explained_variance_\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "var_exp = pca.explained_variance_ratio_\nvar_exp\n", "model": "natural", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n# Dec 10 - Dec 20, 2019\n# Lab on Logistic Regression

\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n\ndf.columns\n\n\ndf.describe()\n\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n\ndf.head()\n\n\ndf.isnull().any()\n\n\n\nvariables = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',\n 'Ticket', 'Fare', 'Embarked', 'Survived', 'Initial']\n# Calculate the correlations\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#

AI Winter School 2019\n# Dec 10 - Dec 20, 2019\n# Lab on Logistic Regression

\n#\n#\n# > *This notebook is part of the AI Winter School 2019 organized by NAAMII. The objectives of this notebook is to use Logistic Regression to predict whether an individual survived or not during the sinking of the Titanic. The data used in this exercise is adopted from https://www.kaggle.com/c/titanic/data.*\n\n# # Table of Contents\n#\n# [Introduction](#introduction)\n#    [Explanation and Theory](#introduction-theory)\n#    [Considerations](#introduction-considerations)\n# [Data Setup](#setup)\n#    [Meta data](#variables)\n#    [Correlation](#correlation)\n#    [Dummy variables for Categorical Data](#dummy)\n# [Scikit-Learn Logistic Regression](#sklearn-logreg)\n# [Summary](#summary)\n\n# # Logistic Regression Classifier Introduction \n#\n# Logistic regression is one of the most simple and well-known machine learning algorithms for classification. Despite its name, it is used for classification rather than regression.\n#\n# In basic terms, it predicts the probability of occurrence of an event by fitting the data to a logistic function. This probability is then translated into a class label based on the set threshold of the function.\n\n# ## Explanation and Theory \n#\n# **Assumptions and properties.**\n# Suppose we have a data set that consists of n samples and m features\n#\n# \\begin{equation}\n# \\label{eqn:samples}\n# \\mathbf{X} = \\{\\mathbf{x_1}, \\mathbf{x_2}, \\ldots \\mathbf{x_n}\\}, \\quad \\mathbf{x_i} \\in \\mathbb{R}^{m}\n# \\end{equation}\n#\n# where\n#\n# \\begin{equation}\n# \\label{eqn:features}\n# \\mathbf{x_i} = (x_i^{(1)}, x_i^{(2)}, \\ldots x_i^{(m)} ) ^T\n# \\end{equation}\n#\n# The target variable is the probability of a sample belonging to a certain class and is represented by\n# \\begin{equation}\n# \\label{eqn:target}\n# \\mathbf{Y}= \\{y_1, y_2 \\ldots y_n \\} \\quad where \\,\\,\\, y_i \\in (0,1)\n# \\end{equation}\n#\n# Let us assume our problem is a binary classification problem, meaning the response/dependent variables has two classes or labels 0 and 1. If we used linear regression, it would give us a straight line that best separates 0 and 1 responses. However, we could not use this line to give us a probability, since it would give us a negative value for the responses near zero on the x-axis. Instead, when we use logistic regression, we fit this data and estimate the target variable using the following **logistic/sigmoid function**:\n#\n# \\begin{equation}\n# \\label{eqn:sigmoid}\n# y_i =\\frac {e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})}} {(1 + e^{(\\beta_0 + \\mathbf{\\beta} \\cdot \\mathbf{x_i})})}\n# \\end{equation}\n# where $\\beta_0$ is called the bias term or the intercept, and $\\mathbf{\\beta}$ are the coefficients associated with the feature vector $\\mathbf{x_i}$.\n# \"logistic\n#\n# The function transforms all input variables to the range [0,1], which brings the smallest or most negative numbers close to zero and the largest positive numbers close to one. This allows us to take real-valued inputs and output a probability of the input belonging to either class zero or one. We can then choose a threshold value, such as 0.5, and provide the class output.\n#\n# **Algorithm and Training.** Logistic regression takes the form of a linear model:\n#\n# $$f(i)=\\beta_0+\\beta_1x_{1,i}+...+\\beta_mx_{m,i} $$\n#\n# where $\\beta_0,...,\\beta_m$ are the regression coefficients or weights assigned to each feature $x$. For each data point $i$, a pseudo-variable $x_{0,i}=1$ is added to correspond to the intercept coefficient $\\beta_0$. 
This allows us to write the model in vector form as:\n# $$f(i)=\\boldsymbol{\\beta}\\cdot\\boldsymbol{X_i}$$\n#\n# When we train the logistic regression classifier, we are trying to find the best values of $\\beta$ to match the data. This is done using an estimation method that attempts to minimize the error of the model. There are several techniques to do this, such as [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent).\n\n# ## Considerations \n#\n# There are a few things to remember when using logistic regression as a classifier. First, it assumes that there is a linear relationship between the independent variables and the dependent variables. In high-dimensional datasets, this may not be the case, so logistic regression may not be the best choice of classifier.\n#\n# Logisitic regression is also sensitive to highly correlated inputs. Having highly correlated inputs can cause the model to be overfit or will cause the model to fail to converge. We will take a closer look at the correlations between different variables in this notebook.\n\n# # Data setup \n\n#\n# ![alt text](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/RMS_Titanic_3.jpg/1280px-RMS_Titanic_3.jpg)\n#\n# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.\n#\n# One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.\n#\n# Our goal is to use predict if an individual survived or not in the titanic ship wreck.\n#\n\n#%%\n\n# All our imports\nimport seaborn as sns\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import confusion_matrix\nfrom matplotlib import pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\n# Run this only if you are using Google Colab\n# from google.colab import drive\n# drive.mount('/content/drive')\n\n#%%\n\n# Enter the path of your file inside the quotes\npath = \"titanic_clean.csv\"\n\n#%%\n\n# Write the code to read the csv file to a dataframe df\ndf = pd.read_csv(path)\n\n#%%\n\ndf.columns\n\n#%%\n\ndf.describe()\n\n#%%\n\ndf.head()\n\n\n# # Variable Metadata \n# **Pclass:** A proxy for socio-economic status (SES)\n#\n# 1 = Upper\n#\n# 2 = Middle\n#\n# 3 = Lower\n#\n# **Age:** Age is fractional if less than 1. 
If the age is estimated, is it in the form of xx.5\n#\n# **SibSp:** The dataset defines family relations in this way:\n#\n# Sibling = brother, sister, stepbrother, stepsister\n#\n# Spouse = husband, wife (mistresses and fianc\u00e9s were ignored)\n#\n#\n# **Parch:** The dataset defines family relations in this way:\n# Parent = mother, father\n# Child = daughter, son, stepdaughter, stepson\n# Some children travelled only with a nanny, therefore parch=0 for them.\n#\n# **Embarked:** The port from where the particular passenger was embarked/boarded.\n#\n# **Survived:**\n# 0: if the person did not survive\n# 1: if the person survived\n\n#%%\n\ndf.head()\n\n#%%\n\ndf.isnull().any()", "original_comment": "# # Correlation between variables\n", "target_code": "corr_mat = df[variables].corr().round(2)\n", "project_metadata": {"full_name": "ghimireadarsh/AI-WinterSchool", "description": "Comprises of various lecture slides, papers, practical notebooks used during AI Winter school, organized by NAAMII at Pokhara, Nepal from December 10, 2019 to December 20, 2019. ", "topics": [], "git_url": "git://github.com/ghimireadarsh/AI-WinterSchool.git", "stars": 6, "watchers": 6, "forks": 6, "created": "2019-12-14T18:16:09Z", "size": 75918, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1041087, "HTML": 666537, "Python": 20395}, "last_updated": "2020-09-27T21:32:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.figure(figsize=(10, 10))\nsns.heatmap(df.corr(), annot=True)\n", "model": "natural", "intent": "# Correlation between variables"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the 
USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = 
low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. 
The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. 
We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### 
Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. 
Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. 
Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. 
To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). 
Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin 
weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### 
Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)", "original_comment": "# Add the title\n", "target_code": "plt.title('Fuel efficiency vs Horse-power')\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3, "coverage": "Agree", "coverage-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df.plot(kind='scatter', x='hp', y='mpg', s=sizes)\nplt.title('Scatter Plot')\n", "model": "natural", "intent": "# Add the title"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\n# Let's start with the basics\n\n\nsimple = list(range(1, 
19))\nsimple\n\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n\n# Select the last item using positive indexation\nsimple[17]\n\n\n# Select the last item using negative indexation\nsimple[-1]\n\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n\n# What is the biggest number in the list?\nmax(simple)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\n# Let's start with the basics\n\n#%%\n\nsimple = list(range(1, 19))\nsimple\n\n#%%\n\n# Select the 1st item using positive indexation\nsimple[0]\n\n#%%\n\n# Select the 1st item using negative indexation\nsimple[-18]\n\n#%%\n\n# Select the last item using positive indexation\nsimple[17]\n\n#%%\n\n# Select the last item using negative indexation\nsimple[-1]\n\n#%%\n\n# Select a range of items with positive indexation\nsimple[0:7]\n\n#%%\n\n# Select a range of items with negative indexation\nsimple[-18:-11]\n\n#%%\n\n# Select a range of items between 1 and 7 in increments of 2\nsimple[1:7:2]\n\n#%%\n\n# Select the same range of items between 1 and 7 in increments of -2 (backwards)\nsimple[-13:-18:-2]\n\n#%%\n\n# Note how the step increment makes a difference to the order - this doesn't work because it says start at 1,\n# go on until 7 and use increments of negative 2 but if we do negative 2 from 1 we get immediately outside\n# the bounds of our list\nsimple[1:7:-2]\n\n#%%\n\n# Similarly here we are saying start at -18 and go forwards by 2 which again puts us immediately\n# outside the bounds of our list\nsimple[-13:-18:2]\n\n#%%\n\n# Now replace a list item with a new value (6 > 99)\nsimple[-13] = 99\n\n#%%\n\n# And check what it looks like now\nsimple[-13:-18:-2]\n\n#%%\n\n# Add a number at the end of the list\nsimple.append(909)\nsimple\n\n#%%\n\n# Add a number in the middle of the list (add number 6 just before position 5)\nsimple.insert(5, 6)\nsimple\n\n#%%\n\n# Quickly check if a number is somewhere in the list\n99 in simple\n\n#%%\n\n# And then check which index position it occurs in the list\nsimple.index(99)\n\n#%%\n\n# What is the biggest number in the list?\nmax(simple)\n\n#%%", "original_comment": "# And the smallest?\n", "target_code": "min(simple)\n", "project_metadata": {"full_name": "shotleft/how-to-python", "description": null, "topics": [], "git_url": 
"git://github.com/shotleft/how-to-python.git", "stars": 11, "watchers": 11, "forks": 4, "created": "2018-05-03T04:32:17Z", "size": 3364, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2974562}, "last_updated": "2020-12-05T20:07:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "smallest = min(simple)\nsmallest\n", "model": "docstring", "intent": "# And the smallest?"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n\nn_instances = 1\n#image_type = 'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Trade\n# This notebook trades with the hyperparameters selected in the previous over the full set of trading data.\n#\n# Note that in reality the byperparameter tunning could be repeated every trading day but we will ignore that complication.\n\n# #### Import necessary modules\n\n#%%\n\nimport seaborn as sns\nfrom time import sleep\nimport shutil\nfrom sagemaker.session import Session\nfrom sagemaker.estimator import Estimator\nfrom sagemaker import get_execution_role\nimport pandas as pd\nimport numpy as np\nimport math\nfrom IPython.display import Image\nimport boto3\nimport matplotlib.pyplot as plt\nimport matplotlib\nget_ipython().run_line_magic('matplotlib', 'inline')\n\nsns.set()\n\n\n# #### Set for local parameters\n\n#%%\n\nn_instances = 1\n#image_type = 
'cpu'\n#instance_type = 'ml.m5.large'\n\nimage_type = 'gpu'\ninstance_type = 'ml.g4dn.xlarge'\n\nhyperparameters = {'prices_name': 'synthetic-prices-2019.csv',\n 'signals_name': 'signals-2019.csv',\n 'start_day': 2670,\n 'days_per_epoch': 40,\n 'fc1': 13,\n 'fc2': 0,\n 'lr_actor': 0.00039,\n 'lr_critic': 0.00356}\n\ntrain_use_spot_instances = True\ntrain_max_run = 14400\ntrain_max_wait = 14400 if train_use_spot_instances else None\n\nsagemaker_session = Session()\nbucket_name = sagemaker_session.default_bucket()\nrole = get_execution_role()\naccount = boto3.client('sts').get_caller_identity()['Account']\nregion = boto3.Session().region_name\nimage_name = '{}.dkr.ecr.{}.amazonaws.com/portfolio-optimization-{}:latest'.format(\n account, region, image_type)", "original_comment": "# #### Submit the job\n", "target_code": "from sagemaker.estimator import Estimator\n\nestimator = Estimator(role=role,\n train_instance_count=n_instances,\n train_instance_type=instance_type,\n image_name=image_name,\n hyperparameters=hyperparameters)\nestimator.fit()\n", "project_metadata": {"full_name": "daniel-fudge/DRL-Portfolio-Optimization-Custom", "description": "A portfolio optimization framework leveraging Deep Reinforcement Learning (DRL) and a custom trading environment", "topics": [], "git_url": "git://github.com/daniel-fudge/DRL-Portfolio-Optimization-Custom.git", "stars": 3, "watchers": 3, "forks": 1, "created": "2020-06-12T22:27:29Z", "size": 35064, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1170339, "Python": 39958, "Shell": 4637}, "last_updated": "2020-11-01T22:06:49Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "sagemaker = sagemaker.estimator.Estimator(image_name,\n role,\n train_instance_count=1,\n train_instance_type=instance_type,\n output_path='s3://{}/{}/output'.format(\n bucket_name, prefix),\n sagemaker_session=sagemaker_session)\nsagemaker.set_hyperparameters(**hyperparameters)\n", "model": "docstring", "intent": "# Submit the job"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. 
All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. 
Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. 
Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Observational Realism Suite\n#\n# ## Examples\n\n# ### Example 1: SDSS statistical observational realism in gri bands\n#\n# In this example, you will use the use the statistical observational realism described in Bottrell et al (2017a) and made public in Bottrell et al (2019b) to insert a handful of galaxy images quasi-randomly into real SDSS fields. The inputs (found in the Inputs directory) are synthetic idealized photometry images. These images are noiseless with an extraordinarily high resolution spatial resolution (97 pc/pixel). The final image is in AB nanomaggies, a calibrated flux unit. It includes real sky, real PSF degradation, and contamination by additional sources in the field of view. Most importantly, the statistics for these properties match those for real SDSS galaxies. All of the information about the fields in which these images are inserted is included in the image headers, along with all of the user-defined specifics.\n#\n# The images are generated from the G2G3e orbit1 merger from the Moreno et al (2019) merger suite. The images were produced from SKIRT datacubes generated by Maan Hani (University of Victoria). 
We use the quantitative morphologies catalog of Simard et al (2011) as the basis catalog for the insertion statistics.\n#\n#\n#\n\n#%%\n\nfrom astropy.visualization import make_lupton_rgb\nimport matplotlib.pyplot as plt\nfrom SpecToSDSS_gri import *\nfrom glob import glob\nfrom ObsRealism import *\nfrom astropy.io import fits\nimport numpy as np\nimport os\nimport sys\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\nif useSQL:\n import pymysql\n table = 'sdss_dr7_morph_mybkg_mydeblend_gr'\n db = pymysql.connect(host='localhost', db='sdss', password='your_pass')\n c = db.cursor()\n dbcmd = ['SELECT run,rerun,camcol,field',\n 'FROM {}'.format(table)]\n c.execute(' '.join(dbcmd))\n field_info = np.asarray(c.fetchall()).astype(int)\n c.close()\n db.close()\nelse:\n field_info = np.load('Sources/Simard2011_Field_Info.npy')\n\n# '''\n# These common args adopt a redshift of z=0.046 (only used to determine the\n# physical to angular scale [kpc/arcsec]). With rebin_to_CCD=True, the input\n# image is rebinned to a CCD scale set by 'CCD_scale' (which in this case is\n# the 0.396 arcsec/pixel of the SDSS camera. The images are added to real\n# image fields and incorporate a reconstruction of the real SDSS PSF. Poisson\n# noise is added to the image.\n# '''\n\ncommon_args = {\n 'redshift': 0.05, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': False, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': False, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': True,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': True,\n}\n\n# get image list for r-band images (will reformat for other bands in loop)\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\n# bands in which to create images\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'FullReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # draw SDSS field and select insertion point\n sdss_args = make_sdss_args(field_info)\n # loop over each band\n for band in bands:\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band,\n common_args=common_args, sdss_args=sdss_args)\n\n\n# ### Example 2: SemiReal synthetic images\n#\n# Using an approach similar to the one in the last example, you will add realistic Gaussian skies and Gaussian PSF to the images but not insert into real image fields. You can modify the default properties as you like to emulate observations with various instruments (CCD scales, sky noise levels, PSF sizes, etc). 
These sky noise levels are given in relative AB magnitude surface brightness units (AB mag/arcsec2) and the PSF are given in arcsec. Currently, the values that are drawn are independent in each band. Suggestions are welcome on how to better correlate them (mail: cbottrel \"at\" uvic \"dot\" ca).\n\n#%%\n\n# base path to input images\nimg_base_path = 'Inputs/'\n# configuration path (SExtractor params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/'\n# application path (read_PSF, read_atlas, etc.)\nsdss_app_path = 'Sources/utils/sdss-apps/'\n# output directory path\noutput_type = 'FullReal'\noutput_path = 'Outputs/'\nuseSQL = False\n\ncommon_args = {\n 'redshift': 0.046, # mock observation redshift\n 'rebin_to_CCD': True, # rebin to CCD angular scale\n 'CCD_scale': 0.396, # CCD angular scale in [arcsec/pixel]\n 'add_false_sky': True, # add gaussian sky\n # gaussian sky standard dev [AB mag/arcsec2]\n 'false_sky_sig': 24.2,\n 'add_false_psf': True, # convolve with gaussian psf\n 'false_psf_fwhm': 1.1, # gaussian psf FWHM [arcsec]\n 'add_poisson': True, # add poisson noise to galaxy\n # insert into real SDSS sky (using sdss_args)\n 'add_sdss_sky': False,\n # convolve with real SDSS psf (using sdss_args)\n 'add_sdss_psf': False,\n}\n\n# statistics on sky noise (obtained from averages over all Legacy galaxies)\nskySig = {'u': 23.872, 'g': 24.880, 'r': 24.384, 'i': 23.820, 'z': 22.356}\n# standard deviation in sky noise (sky noise level is drawn from this distribution)\nSigskySig = {'u': 0.147, 'g': 0.137, 'r': 0.109, 'i': 0.119, 'z': 0.189}\n# statistics on seeing (obtained from averages over all Legacy galaxies)\nseeing = {'u': 1.551, 'g': 1.469, 'r': 1.356, 'i': 1.286, 'z': 1.308}\n# standard deviation in seeing (seeing is drawn from this distribution)\nSigseeing = {'u': 0.243, 'g': 0.221, 'r': 0.221, 'i': 0.222, 'z': 0.204}\n\nimgList = list(sorted(glob(img_base_path+'/photo_r_CNN*.fits')))\nbands = ['g', 'r', 'i']\n\nfor _imgName in imgList:\n # get redshift from FITS header\n common_args['redshift'] = fits.getheader(_imgName)['REDSHIFT']\n # set holder for output names\n _outName = _imgName.replace(img_base_path, output_path).replace(\n 'total.fits', 'SemiReal.fits')\n # skip image if output already exists\n if os.access(_outName, 0):\n continue\n # loop over each band\n for band in bands:\n # draw a random sky noise from the distribution of typical skies in SDSS\n common_args['false_sky_sig'] = np.random.normal(\n skySig[band], SigskySig[band])\n # draw a random PSF size from the distribution of typical PSF sizes SDSS\n common_args['false_psf_fwhm'] = np.random.normal(\n seeing[band], Sigseeing[band])\n imgName = _imgName.replace('photo_r', 'photo_{}'.format(band))\n outName = _outName.replace('photo_r', 'photo_{}'.format(band))\n ObsRealism(imgName, outName, band=band, common_args=common_args)\n\n\n# ## Suggestions\n#\n# If you have any suggestions or requests to improve or broaden this suite, please contact me.\n\n# ## Generating input\n#\n# I provide a standalone code (SpecToSDSS_gri.py) for generating idealized photometry in AB calibrated surface brightnesses from SKIRT datacubes. This code conveniently generates output that is in correct format for the realism suite. 
To run this example, you must first download the SKIRT datacube here: http://orca.phys.uvic.ca/~cbottrell/share/Realism/spec_G2G3_e-orbit_1_320_i0_total.fits and place it in the Inputs/Datacubes/ directory.\n\n#%%\n\n# base path to SKIRT datacubes\nifu_base_path = 'Inputs/Datacubes/'\n# configuration path (SEx params, gim2d files, etc.)\nsdss_cfg_path = 'Sources/utils/sdss-cfg/SDSS_Photometry/'\n# target redshift\nredshift = 0.046\n\n# list of SKIRT datacubes\nifuList = list(sorted(glob(ifu_base_path+'spec*.fits')))\n# wavelength list from SKIRT\nwl_filename = sdss_cfg_path+'SDSS_gri3_wavelength_grid.dat'\n# bands in which to produce photometry\nbands = ['u', 'g', 'r', 'i']\n\nfor ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('r'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n\n\n# ## Plotting output\n#\n# ### Example 1: \"Idealized\" photometry images\n#\n# This example uses the Lupton_rgb package to make SDSS gri colour composite images from the idealized images in the Input directory or generated from SKIRT datacubes. Colours can be adjusted by changing their relative contributions manually before passing to the lupton_rgb method.\n#\n# See the docs for Lupton RGB for details: https://docs.astropy.org/en/stable/visualization/rgb.html\n\n#%%", "original_comment": "# get list of r-band images\n", "target_code": "imgList_r = list(glob('Inputs/photo_r_CNN*.fits'))\n", "project_metadata": {"full_name": "cbottrell/RealSim", "description": "RealSim is the statistical observational realism suite described in Bottrell et al 2017ab and made public in Bottrell et al 2019b.", "topics": [], "git_url": "git://github.com/cbottrell/RealSim.git", "stars": 5, "watchers": 5, "forks": 1, "created": "2019-07-10T21:26:45Z", "size": 20047, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2416365, "C": 294600, "Python": 34394, "Makefile": 4159, "Tcl": 1042, "Shell": 374, "C++": 88}, "last_updated": "2020-05-29T13:33:55Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "for ifuName in ifuList:\n _outputName = ifuName.replace(\n ifu_base_path, 'Inputs/').replace('spec_', 'photo_{}_CNN_')\n if os.access(_outputName.format('g'), 0):\n continue\n SpecToSDSS_gri(ifuName, _outputName, wl_filename,\n sdss_cfg_path, bands=bands, redshift=redshift)\n", "model": "docstring", "intent": "# get list of r-band images"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm 
import LinearSVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n\ndata.dtypes\n\n\ndata.size\n\n\ndata.info()\n\n\ndata['RAM'].describe()\n\n\ndata.describe()\n\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n\ndf.info()\n\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n\nlen(df)\n\n\ndf.fillna(np.nan)\ndf\n\n\ndf_dropped = df.dropna()\ndf_dropped\n\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n\ndf.head(2)\n\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n\n# checking unique values\ndfm['GPRS'].unique()\n\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n\n# checking data types\ndfm.dtypes\n\n\n# count of every column\ndfm.count()\n\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n\ndf1 = df_dropped\nlen(df1)\n\n\n# checking the datatypes\ndf1.dtypes\n\n\n# displaying info\ndf1.info()\n\n\ndf1.head()\n\n\ndf1.tail()\n\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # DAML Runtime Error\n\n# # Dataset Work\n\n# ## Data Collection and Data Cleaning\n\n#%%\n\n# Import Packages\nfrom tkinter import *\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.svm import LinearSVC\nfrom sklearn.naive_bayes import 
GaussianNB\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score\nfrom scipy import stats\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport math\nimport pandas as pd\nimport numpy as np\n\n\n# ## Load the data\n\n#%%\n\ndata = pd.read_csv('datasets/phone_dataset.csv')\n\n\n# ## Describe the data in multiple- way\n\n#%%\n\ndata.dtypes\n\n#%%\n\ndata.size\n\n#%%\n\ndata.info()\n\n#%%\n\ndata['RAM'].describe()\n\n#%%\n\ndata.describe()\n\n#%%\n\ndata.head(2)\n\n\n# ## Remove Insignificant columns\n\n#%%\n\ndf = data.drop(labels=['weight_oz', 'brand', 'model', 'network_technology', '2G_bands', 'network_speed', 'announced', 'status', 'display_type', 'OS',\n 'Chipset', 'GPU', 'memory_card', 'loud_speaker', 'audio_jack', 'WLAN', 'bluetooth', 'GPS', 'NFC', 'radio', 'USB', 'sensors', 'colors'], axis='columns')\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(2)\n\n\n# ## Check Missing value for each columns\n\n#%%\n\ndf.isnull().sum()\n\n\n# ## Remove rows with missing values\n\n#%%\n\nlen(df)\n\n#%%\n\ndf.fillna(np.nan)\ndf\n\n#%%\n\ndf_dropped = df.dropna()\ndf_dropped\n\n#%%\n\nlen(df_dropped)\n\n\n# ## Formatting your Data \u2013 making data types compatible with other data types.\n\n#%%\n\ndf.head(2)\n\n#%%\n\ndfm = df.copy()\n\n\n# ### Filling Null values with nan\n\n#%%\n\ndfm['GPRS'].fillna(0, inplace=True)\ndfm['EDGE'].fillna(0, inplace=True)\ndfm['3G_bands'].fillna(0, inplace=True)\ndfm['4G_bands'].fillna(0, inplace=True)\ndfm\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['GPRS'] = dfm['GPRS'].fillna(0)\ndfm['EDGE'] = dfm['EDGE'].fillna(0)\ndfm.loc[dfm['GPRS'] == 'No', 'GPRS'] = 0\ndfm.loc[dfm['GPRS'] != 0, 'GPRS'] = 1\ndfm.loc[dfm['EDGE'] == 'No', 'EDGE'] = 0\ndfm.loc[dfm['EDGE'] != 0, 'EDGE'] = 1\ndfm[['GPRS', 'EDGE']] = dfm[['GPRS', 'EDGE']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking unique values\ndfm['GPRS'].unique()\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n\n# ### Changing the datatype of columns for compatibility\n\n#%%\n\ndfm['3G_bands'] = dfm['3G_bands'].fillna(0)\ndfm['4G_bands'] = dfm['4G_bands'].fillna(0)\ndfm.loc[dfm['3G_bands'] != 0, '3G_bands'] = 1\ndfm.loc[dfm['4G_bands'] != 0, '4G_bands'] = 1\ndfm[['3G_bands', '4G_bands']] = dfm[[\n '3G_bands', '4G_bands']].apply(pd.to_numeric)\ndfm\n\n#%%\n\n# checking data types\ndfm.dtypes\n\n#%%\n\n# count of every column\ndfm.count()\n\n#%%\n\n# diplaying the dataframe\ndfm\n\n\n# ### Removing insignificant rows\n\n#%%\n\ndfm = dfm[(dfm['3G_bands'] != 0)]\ndfm\n\n#%%\n\n# couting values of each column\ndfm.count()\n\n\n# ## Removing Rows with null values\n\n#%%\n\ndf_dropped = dfm.dropna()\ndf_dropped\n\n\n# ### Checking the length of the updated dataframe\n\n#%%\n\ndf1 = df_dropped\nlen(df1)\n\n#%%\n\n# checking the datatypes\ndf1.dtypes\n\n#%%\n\n# displaying info\ndf1.info()\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf1.tail()\n\n#%%\n\ndf1\n\n\n# ## Modifying the data of the column as per the requirement\n\n#%%\n\ndef modify_str(s):\n s1 = ''\n for i in s:\n if(i == ' '):\n break\n else:\n s1 += i\n return s1\n\n\nfor i, row in df1.iterrows():\n df1.at[i, 'display_size'] = modify_str(row['display_size'])\ndf1\n\n#%%", "original_comment": "# changing the datatype\n", "target_code": "df1[['display_size']] = df1[['display_size']].apply(pd.to_numeric)\n", "project_metadata": {"full_name": 
"yatinagg/Mobile_Price_Classification", "description": "Dataritz Phone Price Classification", "topics": [], "git_url": "git://github.com/yatinagg/Mobile_Price_Classification.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-05-18T15:04:18Z", "size": 6525, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4727409, "Python": 568}, "last_updated": "2020-08-30T08:37:02Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "df1['display_size'] = df1['display_size'].apply(pd.to_numeric)\n", "model": "natural", "intent": "# changing the datatype"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # IndoXTC - Extracting Toxic-EN Features [XLM-R] 2\n# Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n#\n# This kernel is a part of my undergraduate final year project.\n# Checkout the full github repository:\n# https://github.com/ilhamfp/indonesian-text-classification-multilingual\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nfrom load_data import load_dataset_foreign\nfrom extract_feature import FeatureExtractor\n\nSTART = 20000\nEND = 40000\n\n\n# ## Load Data\n\n#%%\n\ndata = load_dataset_foreign(data_name='toxic')\ndata_pos = data[data['label'] == 1].reset_index(drop=True)\ndata_neg = data[data['label'] == 0].reset_index(drop=True)\n\ntrain = pd.concat([data_pos[START:END],\n data_neg[START:END]]).reset_index(drop=True)\n\nprint(train.shape)\ntrain.head()", "original_comment": "# ## Extract Feature\n", "target_code": "from extract_feature import FeatureExtractor\n\nFE = FeatureExtractor(model_name='xlm-r')\n", "project_metadata": {"full_name": "ilhamfp/indonesian-text-classification-multilingual", "description": "Improving Indonesian text classification using multilingual language model", "topics": ["multilingual-language-model", "text-classification", "indonesian-language", "indonesian-text-classification", "sentiment-analysis", "hate-speech-detection", 
"language-model", "multilingual", "zero-shot", "monolingual", "cross-lingual-transfer", "multilingual-language-models", "indonesian-data", "english-language"], "git_url": "git://github.com/ilhamfp/indonesian-text-classification-multilingual.git", "stars": 7, "watchers": 7, "forks": 0, "created": "2020-04-26T07:27:39Z", "size": 15604, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 3476215, "Python": 28982}, "last_updated": "2020-12-20T17:12:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "X_train, X_test, y_train, y_test = train_test_split(\n train['comment_text'], train['label'], test_size=0.2, random_state=42)\n", "model": "no-comments", "intent": "# Extract Feature"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n\nleads_scoring.head(5)\n\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n\n# Finding the number of unique values under each 
collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n\nround(\n 100*(leads_scoring['Lead Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 
'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n\nround(leads_scoring.describe(), 2)\n\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n\nleads_scoring_model.info()\n\n\nleads_scoring_model.shape\n\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n\n\n\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n\n\n\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n\nX_train.describe()\n\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n\n\n\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n\nlogreg = LogisticRegression()\n\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n\n\n\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n\ncols = X_train.columns[rfe.support_]\n\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n\ncols\n\n\nX_train.shape\n\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data set\n\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a 
dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n\n\n\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### Importing the required libraries to perform Logistic Regression\n\n#%%\n\n# import all the necessary libraries\n\nfrom sklearn.metrics import confusion_matrix\nfrom statsmodels.stats.outliers_influence import variance_inflation_factor\nfrom sklearn import metrics\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.feature_selection import RFECV\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nimport statsmodels.api as sm\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nimport warnings\nfrom sklearn.preprocessing import scale\nimport pandas as pd\nimport numpy as np\nimport pandas as pd\n\n# For Visualisation\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# To Scale our data\n\nwarnings.filterwarnings(\"ignore\")\n\n\n# ### Step 1: Reading and Understanding the Data\n\n#%%\n\nleads_scoring = pd.read_csv(\"./Leads.csv\")\n\n\n# ### Inspecting the data\n\n#%%\n\nleads_scoring.head(5)\n\n#%%\n\nleads_scoring.info()\n\n\n# #### Replacing the Select option from categorical variables as it is esentially just a null value\n\n#%%\n\nleads_scoring = leads_scoring.replace('Select', np.nan)\n\n#%%\n\nleads_scoring.describe()\n\n\n# #### Dropping duplicate records\n\n#%%\n\nleads_scoring.drop_duplicates(inplace=True)\n\n\n# **Missing values along rows**\n\n#%%\n\nleads_scoring.isnull().sum(axis=1)\n\n\n# **Missing values along columns**\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treatment of missing values\n\n# **Dropping collumn with 70% or higher percentage of empty records**\n\n#%%\n\nleads_scoring = leads_scoring.drop(\n ['How did you hear about X Education', 'Lead Profile'], axis=1)\n\n\n# #### Removing Information about customer that is for company purpose and doesn't serve any use in analysis\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Prospect ID', 'Lead Number'], axis=1)\n\n#%%\n\n# Finding the number of unique values under each collumn\nleads_scoring.nunique()\n\n\n# **Dropping Collumns with single value as it doesn't serve any use for analysis**\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Magazine', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content',\n 'Get updates on DM Content', 'I agree to pay the amount through cheque'], axis=1)\n\n#%%\n\nleads_scoring.isnull().sum()\n\n\n# #### Imputing Missing values in Lead Quality\n\n#%%\n\nleads_scoring.groupby(by='Lead Quality').count()\n\n#%%\n\nround(\n 100*(leads_scoring['Lead 
Quality'].isnull().sum()/len(leads_scoring.index)), 2)\n\n\n# There are more than 50% missing values in 'Lead Quality' column because of no assignment by X Education employee.\n# We don't have any information about these missing fields hence replacing them by 'Unassigned'\n\n#%%\n\nleads_scoring['Lead Quality'] = leads_scoring['Lead Quality'].replace(\n np.nan, \"Unassigned\")\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Asymmetrique Profile Index','Asymmetrique Profile Score'\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Index']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Activity Score']).Converted.count()\n\n#%%\n\nleads_scoring.groupby(['Asymmetrique Profile Score']).Converted.count()\n\n\n# #### Dropping Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Asymmetrique Activity Index', 'Asymmetrique Activity Score',\n 'Asymmetrique Profile Index', 'Asymmetrique Profile Score'], axis=1)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating City Collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('City').City.count()/len(leads_scoring.index)), 2)\n\n\n# #### 'City' column has approximately 40% missing values. The Collumn has maximum occurence of 'Mumbai' and other values have very few occurences .Thus we cannot impute the collumn with any value and decide to drop it as well\n\n#%%\n\nleads_scoring.drop('City', axis=1, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Tags collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Tags').Tags.count()/len(leads_scoring.index)), 2)\n\n\n# Since we don't know what might be the status of missing value 'Tags', it is better to replace them with value 'Unknown'\n\n#%%\n\nleads_scoring['Tags'] = leads_scoring['Tags'].replace(np.nan, 'Unknown')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Specialization collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Specialization').Specialization.count() /\n len(leads_scoring.index)), 2)\n\n\n# 37% values are missing in 'Specialization' & we don't have any information about those missing value prospects. 
Hence replacing the null values with 'Specialization Not given'\n\n#%%\n\nleads_scoring['Specialization'] = leads_scoring['Specialization'].replace(\n np.nan, 'Specialization Not given')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What is your current occupation missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What is your current occupation')\n ['What is your current occupation'].count()/len(leads_scoring.index)), 2)\n\n\n# 60% of Data is Unemployed, however it'd be wrong to impute this value ,hence we will impute it with Other\n\n#%%\n\nleads_scoring['What is your current occupation'] = leads_scoring['What is your current occupation'].replace(\n np.nan, 'Other')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating What matters most to you in choosing a course collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# In 'What matters most to you in choosing a course' 71% values are 'Better Career Prospects'. Missing values are 29%. It makes sense both logically as well as business point of view to impute the collumn value with 'Better Career Prospects'\n\n#%%\n\nleads_scoring['What matters most to you in choosing a course'] = leads_scoring['What matters most to you in choosing a course'].replace(\n np.nan, 'Better Career Prospects')\n\n#%%\n\nround(100*(leads_scoring.groupby('What matters most to you in choosing a course')\n ['What matters most to you in choosing a course'].count()/len(leads_scoring.index)), 2)\n\n\n# #### After imputing the values we can see that 99.97% of the collumn value is Better Career Prospects . Thus it can be dropped as the main reason that customers take course is for Better Career Prospects and doesn't help in analysis\n\n#%%\n\nleads_scoring.drop(\n 'What matters most to you in choosing a course', axis=1, inplace=True)\n\n\n# 'What is your current occupation' has ~29% missing values. 60% prospects are Unemployed. 
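# A small hedged sketch (hypothetical toy column, made-up values) of how category
# shares can be inspected, NaN included, before choosing a fill value:
# value_counts(dropna=False, normalize=True) reports the share of missing entries
# alongside the observed classes.
import numpy as np
import pandas as pd

occupation = pd.Series(['Unemployed', 'Unemployed', np.nan,
                        'Working Professional', np.nan])
print(occupation.value_counts(dropna=False, normalize=True).round(2))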
But it is unsafe to replace the missing fields with 'Unemployed'\n# Hence replacing the missing field with 'Other'\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Country collumn missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Country').Country.count()/len(leads_scoring.index)), 2)\n\n\n# Country India is the maximum occuring value in Collumn Country thus imputing missing values with this value\n\n#%%\n\nleads_scoring['Country'] = leads_scoring['Country'].replace(np.nan, 'India')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Inspecting & Treating missing values in 'Lead Source'\n\n#%%\n\nround(100*(leads_scoring.groupby('Page Views Per Visit')\n ['Page Views Per Visit'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['Page Views Per Visit'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Total Visits column\n\n#%%\n\nround(100*(leads_scoring.groupby('TotalVisits')\n ['TotalVisits'].count()/len(leads_scoring.index)), 2)\n\n\n# 0.0 is the maximum occuring values , thus we will impute collumn with the same\n\n#%%\n\nleads_scoring['TotalVisits'].replace(np.nan, 0.0, inplace=True)\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating missing values in Last Activity collumn\n\n#%%\n\nround(100*(leads_scoring.groupby('Last Activity')\n ['Last Activity'].count()/len(leads_scoring.index)), 2)\n\n\n# Replacing nan values with maximum occuring value that is Email Opened\n\n#%%\n\nleads_scoring['Last Activity'] = leads_scoring['Last Activity'].replace(\n np.nan, 'Email Opened')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Treating Lead Source missing values\n\n#%%\n\nround(100*(leads_scoring.groupby('Lead Source')\n ['Lead Source'].count()/len(leads_scoring.index)), 2)\n\n#%%\n\n# Google is appearing twice in different case letters, removing this inconsistency\nleads_scoring['Lead Source'] = np.where(\n leads_scoring['Lead Source'] == \"google\", \"Google\", leads_scoring['Lead Source'])\n\n#%%\n\nround(100*(leads_scoring['Lead Source'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# In lead Source column, replacing null values with most occurring value \"Google\"\n\n#%%\n\nleads_scoring['Lead Source'] = leads_scoring['Lead Source'].replace(\n np.nan, 'Google')\n\n#%%\n\nround(100*leads_scoring.isnull().sum()/len(leads_scoring.index), 2)\n\n\n# #### Thus all missing values have been handled\n\n# **Treating columns based on value frequency**\n\n#%%\n\nleads_scoring.nunique()\n\n\n# **Checking the column frequencies where only '2' types of values exits.**\n\n#%%\n\nround(\n 100*(leads_scoring['Do Not Email'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Do Not Call'].value_counts() /\n len(leads_scoring.index)), 2)\n\n\n# #### The column \"Do not Call\" has almost all values as \"No\", hence this column can be safely dropped in absence of variabilty.\n\n#%%\n\nleads_scoring.drop('Do Not Call', axis=1, inplace=True)\n\n#%%\n\nround(100*(leads_scoring['Search'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Search', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 
100*(leads_scoring['Newspaper Article'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper Article',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['X Education Forums'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['X Education Forums',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*(leads_scoring['Newspaper'].value_counts() /\n len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Newspaper', 'Converted']\n ).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Digital Advertisement'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Digital Advertisement',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n#%%\n\nround(\n 100*(leads_scoring['Through Recommendations'].value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['Through Recommendations',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **It can be noted that customer who said \"Yes\" in above 6 columns is a very small percentage and the conversion percentage of each is almost negligible.So the fields marked \"Yes\" don't serve the variance of the dataset and thus doesn't prove to be useful to the analysis.\n# Hence we have chosen to drop the following columns**\n# * Search\n# * Newspaper Article\n# * X Education Forums\n# * Newspaper\n# * Digital Advertisement\n# * Through Recommendations\n#\n\n#%%\n\nleads_scoring = leads_scoring.drop(['Search', 'Newspaper Article', 'X Education Forums',\n 'Newspaper', 'Digital Advertisement', 'Through Recommendations'], axis=1)\n\n#%%\n\nround(100*(leads_scoring['A free copy of Mastering The Interview']\n .value_counts()/len(leads_scoring.index)), 2)\n\n#%%\n\nround(100*((leads_scoring.groupby(['A free copy of Mastering The Interview',\n 'Converted']).Converted.count())/len(leads_scoring.index)), 2)\n\n\n# **Outlier Treatment**\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n\n# **Retaining data within 3 time Std. 
Dev for each column of the following collumns**\n# * Total Time Spent on Website\n# * Page Views Per Visit\n\n#%%\n\ndef remove_outlier(df, Data):\n df_out = df[np.abs(df[Data]-df[Data].mean()) <= (3*df[Data].std())]\n return df_out\n\n#%%\n\nleads_scoring = remove_outlier(leads_scoring, 'Total Time Spent on Website')\nleads_scoring = remove_outlier(leads_scoring, 'Page Views Per Visit')\n\n#%%\n\nround(leads_scoring.describe(), 2)\n\n#%%\n\nround(100*(leads_scoring.Converted.value_counts()/len(leads_scoring.index)), 2)\n\n\n# #### After Outlier Treatment the data has 38.49% cases of Conversion.\n\n# **Dummy variable creation for Logistic regression.**\n\n#%%\n\ndummy_df = pd.get_dummies(leads_scoring[['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity']], drop_first=True)\n\n# Adding the results to the master dataframe\nleads_scoring_model = pd.concat([leads_scoring, dummy_df], axis=1)\n\n#%%\n\n# List of variables to map\n\nvarlist = ['Do Not Email', 'A free copy of Mastering The Interview']\n\n# Defining the map function\n\n\ndef binary_map(x):\n return x.map({'Yes': 1, \"No\": 0})\n\n\n# Applying the function to the housing list\nleads_scoring_model[varlist] = leads_scoring_model[varlist].apply(binary_map)\n\n#%%\n\nleads_scoring_model.drop(['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization',\n 'What is your current occupation', 'Tags', 'Lead Quality', 'Last Notable Activity'], axis=1, inplace=True)\n\n#%%\n\nleads_scoring_model.info()\n\n#%%\n\nleads_scoring_model.shape\n\n#%%\n\nleads_scoring_model.head()\n\n\n# ## Data Modelling\n# ### Test Train Split\n\n#%%\n\n\n\n#%%\n\n# Putting feature variable to X\nX = leads_scoring_model.drop(['Converted'], axis=1)\nX.head()\n\n#%%\n\n# Putting response variable to y\ny = leads_scoring_model['Converted']\n\ny.head()\n\n\n# #### Splitting the data into train and test\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, train_size=0.7, test_size=0.3, random_state=100)\n\n\n# **Feature Scaling**\n\n#%%\n\n\n\n#%%\n\nscaler = StandardScaler()\nX_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']] = scaler.fit_transform(\n X_train[['Total Time Spent on Website', 'TotalVisits', 'Page Views Per Visit']])\nX_train.head()\n\n#%%\n\nX_train.describe()\n\n#%%\n\n# Importing matplotlib and seaborn\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Model Building\n\n#%%\n\n\n\n#%%\n\nX_train.groupby('Lead Quality_Low in Relevance').count()\n\n#%%\n\n# Logistic regression model\nlogistic_model = sm.GLM(y_train, (sm.add_constant(\n X_train)), family=sm.families.Binomial())\nlogistic_model.fit().summary()\n\n\n# **Feature Selection Using RFE**\n\n#%%\n\nlogreg = LogisticRegression()\n\n#%%\n\nrfe = RFE(logreg, 15) # running RFE with 15 variables as output\nrfe = rfe.fit(X_train, y_train)\n\n#%%\n\n\n\n#%%\n\nlist(zip(X_train.columns, rfe.support_, rfe.ranking_))\n\n#%%\n\ncols = X_train.columns[rfe.support_]\n\n#%%\n\nX_train.columns[~rfe.support_]\n\n\n# ### The following columns are required for Building the model\n\n#%%\n\ncols\n\n#%%\n\nX_train.shape\n\n#%%\n\nX_train[cols].shape\n\n\n# ##### Assessing the model with StatsModels\n\n#%%\n\nX_train_sm = sm.add_constant(X_train[cols])\nlogistic_model2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())\nres = logistic_model2.fit()\nres.summary()\n\n\n# #### Getting the predicted values on the train data 
set\n\n#%%\n\ny_train_pred = res.predict(X_train_sm)\ny_train_pred[:10]\n\n#%%\n\ny_train_pred = y_train_pred.values.reshape(-1)\ny_train_pred[:10]\n\n\n# #### Creating a dataframe with the original 'Converted' flag and the 'Predicted_Conversion' flag value also calculating 'Converted_Prob' & 'Lead_Score'\n\n#%%\n\ny_train_pred_final = pd.DataFrame(\n {'Converted': y_train.values, 'Converted_Probability': y_train_pred})\ny_train_pred_final.head()\n\n#%%\n\ny_train_pred_final['Lead_Score'] = round(\n (y_train_pred_final['Converted_Probability']*100))\ny_train_pred_final.head()\n\n\n# ##### Creating new column 'predicted' with 1 if Converted_Prob > 0.5 else 0\n\n#%%\n\ny_train_pred_final['Predicted_Conversion'] = y_train_pred_final.Converted_Probability.map(\n lambda x: 1 if x > 0.5 else 0)\n# Let's see the head\ny_train_pred_final.head()\n\n#%%\n\n\n\n#%%\n\n# Confusion matrix\nconfusion = metrics.confusion_matrix(\n y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\nprint(confusion)\n\n#%%", "original_comment": "# Let's check the overall accuracy.\n", "target_code": "print(metrics.accuracy_score(y_train_pred_final.Converted,\n y_train_pred_final.Predicted_Conversion))\n", "project_metadata": {"full_name": "saad1504/Upgrad_DataScience_Projects", "description": "All Data Science projects completed for PGPDS by Upgrad", "topics": [], "git_url": "git://github.com/saad1504/Upgrad_DataScience_Projects.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2019-10-14T16:57:22Z", "size": 29931, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 6008971, "PLSQL": 11605}, "last_updated": "2020-10-12T22:18:23Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted_Conversion)\n", "model": "docstring", "intent": "# Let's check the overall accuracy."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport 
warnings\nwarnings.filterwarnings('ignore')\n\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport shap\nfrom pdpbox import pdp, info_plots # for partial plots\nfrom sklearn.metrics import confusion_matrix # for model evaluation\nfrom sklearn.metrics import roc_curve, auc # for model evaluation\nfrom sklearn.tree import export_graphviz # plot tree\nfrom sklearn.ensemble import RandomForestClassifier # for the model\nfrom sklearn.model_selection import train_test_split # for data splitting\nfrom sklearn.model_selection import ShuffleSplit\nimport xgboost as xgb\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.svm import SVR\nfrom sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_validate\nfrom sklearn import linear_model\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\n#%%\n\ndef accuracy(y, y_pred):\n return np.mean(y == y_pred)\n\n#%%\n\ncol_names = ['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',\n 'SOP', 'LOR ', 'CGPA', 'Research', 'Chance of Admit ']\ncol_names = [s.strip().lower().replace(\" \", \"_\") for s in col_names]\n\n#%%\n\ndf_pred = pd.read_csv(\"datasets/pred.csv\")\ndf_pred.columns = col_names\ndf_pred[col_names[-1]][df_pred[col_names[-1]] > 0.5] = 1\ndf_pred[col_names[-1]][df_pred[col_names[-1]] < 0.5] = 0\ndf_pred[col_names[-1]] = df_pred[col_names[-1]].astype(int)\ndf_pred = df_pred.set_index(col_names[0])\nprint(df_pred.shape)\ndf_pred.head()\n\n#%%\n\ndf_train = pd.read_csv(\"datasets/train.csv\")\ndf_train.columns = col_names\ndf_train[col_names[-1]][df_train[col_names[-1]] > 0.5] = 1\ndf_train[col_names[-1]][df_train[col_names[-1]] < 0.5] = 0\ndf_train[col_names[-1]] = df_train[col_names[-1]].astype(int)\ndf_train = df_train.set_index(col_names[0])\nprint(df_train.shape)\ndf_train.head()", "original_comment": "# #### Check Missing Value\n", "target_code": "for col in df_train.columns:\n if any(df_train[col].isnull()):\n print(\"feature %s, missing %i entries\" %\n (col, sum(df_train[col].isnull())))\n else:\n print(\"feature %s has no missing value\" % col)\n", "project_metadata": {"full_name": "shawlu95/Data-Science-Toolbox", "description": "Examples and illustration of basic statistic concepts, probability distribution, Monte Carlo simulation, preprocessing and visualization techniques, and statistical testing.", "topics": [], "git_url": "git://github.com/shawlu95/Data-Science-Toolbox.git", "stars": 28, 
"watchers": 28, "forks": 11, "created": "2019-03-25T19:58:55Z", "size": 157445, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 52401937, "Python": 36992, "TSQL": 3834, "PLpgSQL": 3609, "Shell": 3459, "R": 1437}, "last_updated": "2020-12-26T18:51:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "usefulness": "Strongly disagree", "usefulness-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "df = pd.concat([df_train, df_pred], axis=1)\ndf.head()\n", "model": "no-comments", "intent": "# Check Missing Value"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data\n\n\n\ndef MinMaxScaler(data):\n ''' Min Max Normalization\n Parameters\n ----------\n data : numpy.ndarray\n input data to be normalized\n shape: [Batch size, dimension]\n Returns\n ----------\n data : numpy.ndarry\n normalized data\n shape: [Batch size, dimension]\n References\n ----------\n .. 
[1] http://sebastianraschka.com/Articles/2014_about_feature_scaling.html\n '''\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Stock Prediction with RNN\n# RNN\uc744 \uc774\uc6a9\ud55c \uac04\ub2e8\ud55c \uc8fc\uc2dd \uc608\uce21 \ubaa8\ub378\uc744 \ud559\uc2b5\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n#%%\n\n# library import\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.utils import to_categorical\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nprint(tf.__version__)\nprint(keras.__version__)\n\n\n# ## Hyper Parameters\n\n#%%\n\n# train Parameters\nseq_length = 7\ndata_dim = 5\nhidden_size = 10\noutput_dim = 1\nlearning_rate = 0.001\ntraining_epochs = 500\nbatch_size = 25\n\n\n# ## Preparing Data", "original_comment": "# ### MinMax Scaling\n", "target_code": " import numpy as np\n\n numerator = data - np.min(data, 0)\n denominator = np.max(data, 0) - np.min(data, 0)\n return numerator / (denominator + 1e-7)\n", "project_metadata": {"full_name": "jwlee-ml/TensorFlow_Training_13th", "description": "Tensorflow\ub85c \uc2dc\uc791\ud558\ub294 \ub525\ub7ec\ub2dd Camp 13\uae30 \uc2e4\uc2b5", "topics": [], "git_url": "git://github.com/jwlee-ml/TensorFlow_Training_13th.git", "stars": 4, "watchers": 4, "forks": 5, "created": "2019-06-14T14:39:05Z", "size": 23519, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 23325250}, "last_updated": "2019-11-05T13:31:34Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "(x_train, y_train), (x_test, y_test) = mnist.load_data()\nx_train = x_train.astype('float32')\nx_test = x_test.astype('float32')\nx_train /= 255\nx_test /= 255\nprint(x_train.shape[0], 'train samples')\nprint(x_test.shape[0], 'test samples')\ny_train = to_categorical(y_train, num_classes=10)\ny_test = to_categorical(y_test, num_classes=10)\n", "model": "natural", "intent": " # MinMax Scaling"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 
')\n\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n\n# ##Inspecting the data\n\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n\nsns.countplot(x='Class', data=dat)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \"Open\n\n# # Credit Card Fraud Detection Ensembles\n#\n# Example of classification of unbalanced datasets.\n# Dataset https://www.kaggle.com/mlg-ulb/creditcardfraud from Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00e9 Libre de Bruxelles).\n\n# ##Loading the dataset\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom imblearn.over_sampling import SMOTE\nfrom sklearn import metrics\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split\nimport seaborn as sns\nimport numpy as np\nimport pandas as pd\nget_ipython().system(\n 'wget -O creditfraud.zip https://www.dropbox.com/s/tl20yp9bcl56oxt/creditcardfraud.zip?dl=0 ')\n\n#%%\n\nget_ipython().system('unzip creditfraud.zip')\n\n\n# ##Importing necessary libraries\n\n#%%\n\n# ##Inspecting the data\n\n#%%\n\ndat = pd.read_csv('creditcard.csv')\ndat.head()\n\n#%%\n\n# checking for null values\ndat.isnull().sum().max()\n\n\n# The dataset is hifghly unbalanced\n\n#%%\n\ndat['Class'].value_counts()/dat['Class'].count()\n\n#%%\n\nsns.countplot(x='Class', data=dat)", "original_comment": "# We won't be using \"Time\" variable\n", "target_code": "dat = dat.drop(['Time'], 1)\n", "project_metadata": {"full_name": "dpanagop/ML_and_AI_examples", "description": null, "topics": [], "git_url": "git://github.com/dpanagop/ML_and_AI_examples.git", "stars": 2, "watchers": 2, "forks": 2, "created": "2019-07-16T10:55:13Z", "size": 12192, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5902376}, "last_updated": "2020-11-24T20:45:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "sns.countplot(x='Time', data=dat)\n", "model": "natural", "intent": "# We won't be using \"Time\" variable"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. 
Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Newsletter 5. Vector Calculus: Work (Line Integrals) and Green's Theorem\n\n# ## Libraries and main settings\n\n#%%\n\n# Numerical computation\nimport numpy as np\n\n# graph library\nimport matplotlib.pyplot as plt\n\n# 3d frame\nfrom mpl_toolkits.mplot3d import Axes3D\n\n# To recognize LaTeX commands\nplt.rc('text', usetex=True)\n\n# font family\nplt.rc('font', family='serif')\n\n# style sheet\nplt.style.use('dark_background')\n\n# change the background color\nc_background = '#363636'\n\n\n# [_Style Sheets_](https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html)\n\n# ## Work and Line integrals\n\n# ### Plot 1. 
Vector field + curve\n\n# **Parametric curve**\n# $$\\vec{r}(t) = t \\ \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} +2\\left( t-2 \\right)^{3} \\right)\\hat{j} \\\\\n# 0\\leq t \\leq 2.8$$\n#\n# **Vector Field**\n# $$\\vec{F}(x,y) = \\sin(x) \\ \\hat{i} + \\cos(y) \\ \\hat{j}$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(0, 5, 25)\nY = np.linspace(0, 5, 25)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 2. 
Work (Line Integral in a Vector Field) from a geometric point of view\n\n# $$W = \\int_C \\vec{F}\\cdot\\text{d}\\vec{r} = \\int_{a}^{b} \\vec{F}\\left( \\vec{r} (t) \\right)\\cdot \\vec{r}' (t) \\ \\text{d}t$$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2.855, 100)\n\n# parametric equation of the curve\nX_curve = t\nY_curve = 5*((t - 2)**2) + 2*((t - 2)**3)\n\n# ------------------ r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 1\nV_dcurve = 4 - 14*t + 6*(t**2)\n\n\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = t \\hat{i} + \\left( 5\\left( t-2 \\right)^{2} + 2\\left( t-2 \\right)^{3}\\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot ( r'(t) )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U_dcurve, V_dcurve, # value of each vector\n color='#C3C3C3',\n width=0.002,\n headwidth=4,\n headlength=6,\n label=r'$\\mathrm{d}\\vec{r} = 1\\hat{i} + \\left( 4-14t + 6t^{2} \\right)\\hat{j}$'\n )\n\n# Create the vector plot ( Vector field )\nplt.quiver(X_curve, Y_curve, # Position of each vector\n U*Magnitude, V*Magnitude, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=4,\n headlength=6,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# limit the plot\nplt.xlim(0, 3) # show between 0 and 3\nplt.ylim(-0.1, 5) # show between -0.1 and 5\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ## Green's Theorem\n\n# $$\\oint_{C}\\vec{F}\\cdot\\text{d}\\vec{r} = \\iint_{R} \\left( \\frac{\\partial F_{y} }{\\partial x} - \\frac{\\partial F_{x}}{\\partial y} \\right) \\text{d}A$$\n\n# ### Plot 3. 
Vector Field + Closed Curve $r(t) =\\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n# ----------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = np.linspace(-5, 5, 30)\nY = np.linspace(-5, 5, 30)\n\n\n# Points in the XY plane\nX, Y = np.meshgrid(X, Y)\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 100)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# ------------------------------ PLOT TIME ---------------------------------------------\n\n# Create figure and dimensions\nplt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\nax = plt.axes()\nax.set_facecolor(c_background) # change the color of the background\n\n# plot the line\nplt.plot(X_curve, Y_curve,\n label=r'$\\vec{r}(t) = \\left( 3\\cos(t)\\left( 1-\\cos(t) \\right) + 2 \\right) \\hat{i} + \\left( 3\\sin(t)\\left( 1-\\cos(t) \\right) \\right) \\hat{j}$',\n color='#E8175D'\n )\n\n\n# Create the vector plot\nplt.quiver(X, Y, # Position of each vector\n U, V, # value of each vector\n Magnitude, # magnitude of each vector, this will help the cmap\n width=0.002,\n headwidth=3,\n headlength=5,\n cmap=plt.cm.magma # assigns a color to each vector depending its magnitude\n )\n\n# Add title and label to the axes\nplt.title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nplt.xlabel(r'$x$', size=20)\nplt.ylabel(r'$y$', size=20)\n\n# change size of the tick params\nplt.tick_params(labelsize=15)\n\n# create a legend for the line\nlegend = plt.legend(loc=9, prop={'size': 15})\nlegend.get_frame().set_facecolor(c_background)\n\n# create a color bar for the vector field.\n# the color bar shows us the color assigned to the magnitude of the vector\ncolor_bar = plt.colorbar(orientation='vertical',\n pad=0.05,\n )\n\n# Set a title to the colorbar\ncolor_bar.set_label(label='Magnitud del vector',\n size=20,\n labelpad=15, # separation of the title from the color bar\n )\n\n# change the size of the numbers in the colorbar\ncolor_bar.ax.tick_params(labelsize=15)\n\nplt.tight_layout()\nplt.show()\n\n\n# ### Plot 4. 
Green's Theorem (Line Integral)\n\n#%%\n\n# non interactive plots\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n# ------------------------------------ PARAMETRIC CURVE --------------------------------------\n# parameter t\nt = np.linspace(0, 2*np.pi, 150)\n\n# parametric equation of the curve\nX_curve = 3*np.cos(t)*(1 - np.cos(t)) + 2\nY_curve = 3*np.sin(t)*(1 - np.cos(t))\n\n# --------------------------------- r'(t) OF THE CURVE ---------------------------------------\nU_dcurve = 3*(-1 + 2*np.cos(t))*np.sin(t)\nV_dcurve = 3*(1 + 2*np.cos(t))*((np.sin(t/2))**2)\n\n# -------------------------------------------- VECTOR FIELD ----------------------------------\n# Domain of the vector field\nX = X_curve\nY = Y_curve\n\n# Value each component of the vector field => F(x,y) = u*i + v*j\nU = np.sin(X)\nV = np.cos(Y)\n\n# magnitude of all the vectors\nMagnitude = np.sqrt(U**2 + V**2)\n\n# Normalize so all of them have magnitude of one\nU = U/Magnitude\nV = V/Magnitude\n\n# ----------------------------------- PLOT TIME ---------------------------------------------", "original_comment": "# Create figure and dimensions\n", "target_code": "plt.figure(figsize=(10.6, 6),\n dpi=200, # make your plot an HDPlot, remove it if you're not in a jupyter notebook\n facecolor=c_background # change the color of the background\n )\n", "project_metadata": {"full_name": "isaacarroyov/ss_plots", "description": "Repositorio de gr\u00e1ficas realizadas en Python para mis boletines de servicio social (Ecuaciones Diferenciales y An\u00e1lisis Vectorial) || Repository of the plots made in Python for my social service bulletins (Differential Equations and Vector Calculus)", "topics": ["differential-equations", "math", "vector-analysis", "university", "python3", "python", "ecuaciones-diferenciales"], "git_url": "git://github.com/isaacarroyov/ss_plots.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2020-08-27T19:15:30Z", "size": 21849, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 29758848}, "last_updated": "2020-11-24T18:53:41Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "fig, ax = plt.subplots(figsize=(10, 10))\nax.plot(X_curve, Y_curve)\nax.set_title(r'$\\vec{F} = \\sin(x)\\hat{i} + \\cos(y) \\hat{j}$',\n size=25,\n pad=15)\nax.set_xlabel(r'$x$', size=20)\nax.set_ylabel(r'$y$', size=20)\n# change size of the tick params\n# create a legend for the line\nlegend = plt.legend(loc=9,\n", "model": "natural", "intent": "# Create figure and dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. 
More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
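# A quick hedged sketch of the PIL-to-NumPy conversion described here. The 32x32
# RGB image is synthetic, created only to keep the snippet self-contained; for a
# real CIFAR10 sample the same np.array call applies.
import numpy as np
from PIL import Image

pil_image = Image.new('RGB', (32, 32), color=(255, 0, 0))  # stand-in for a CIFAR10 sample
array_image = np.array(pil_image)                          # -> shape (32, 32, 3), dtype uint8
print(array_image.shape, array_image.dtype)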
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
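# Hedged sketch (dummy random batch, not the real dataset) of the NHWC -> NCHW
# conversion: a single np.transpose is equivalent to the pair of np.swapaxes
# calls used below.
import numpy as np

batch_nhwc = np.random.rand(8, 32, 32, 3)            # [BATCH, IMAGE_Y, IMAGE_X, CHANNEL]
batch_nchw = np.transpose(batch_nhwc, (0, 3, 1, 2))  # [BATCH, CHANNEL, IMAGE_Y, IMAGE_X]
print(batch_nhwc.shape, '->', batch_nchw.shape)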
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
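# The `convert_to_one_hot` function above leaves the actual conversion as a
# [TODO]. Below is a minimal sketch of one way to fill it in, assuming the 10
# CIFAR-10 classes; the function and constant names are illustrative and do not
# appear in the original notebook.

import numpy as np

NUMBER_OF_CLASSES = 10


def convert_to_one_hot_sketch(old_class):
    """Convert an integer label to a 'one hot' vector, e.g. 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]."""
    one_hot = np.zeros(NUMBER_OF_CLASSES, dtype=np.float32)
    one_hot[old_class] = 1.0
    return one_hot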
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
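# The ConvolutionalNeuralNetwork class above leaves every layer and forward
# step as a [TODO]. Here is a minimal sketch of one way to fill them in that
# matches the shape comments in forward(). One deliberate deviation is flagged
# here because it is not in the original comments: the last activation is a
# sigmoid instead of a ReLU, so the outputs stay in [0, 1] as nn.BCELoss
# (used later for the loss) expects.

import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvolutionalNeuralNetworkSketch(nn.Module):
    """Illustrative version of the network; mirrors the [TODO] comments above."""

    def __init__(self):
        super(ConvolutionalNeuralNetworkSketch, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3)    # [B, 3, 32, 32]  -> [B, 32, 30, 30]
        self.conv2 = nn.Conv2d(32, 64, 3)   # [B, 32, 15, 15] -> [B, 64, 13, 13]
        self.conv3 = nn.Conv2d(64, 128, 3)  # [B, 64, 6, 6]   -> [B, 128, 4, 4]
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))  # -> [B, 32, 15, 15]
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))  # -> [B, 64, 6, 6]
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))  # -> [B, 128, 2, 2]
        x = x.view(-1, 512)                              # flatten -> [B, 512]
        x = F.relu(self.fc1(x))                          # -> [B, 128]
        x = F.relu(self.fc2(x))                          # -> [B, 128]
        x = torch.sigmoid(self.fc3(x))                   # -> [B, 10], values in [0, 1]
        return x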
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
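# The train() function defined just below leaves its five core steps as
# [TODO]s. A minimal sketch of those steps is shown here, assuming the `cnn`,
# `loss_function` and `optimizer` objects created above; validation() later
# performs the same forward pass and loss computation but skips the gradient
# reset, the backward pass and the weight update.

def training_step_sketch(batch_images, batch_classes):
    """One optimisation step; relies on the cnn, loss_function and optimizer defined above."""
    optimizer.zero_grad()                            # Reset all gradients in the model
    net_output = cnn(batch_images)                   # Compute output based on input images
    loss = loss_function(net_output, batch_classes)  # Compute loss against the true classes
    loss.backward()                                  # Compute gradients for the network's weights
    optimizer.step()                                 # Update the weights with those gradients
    return net_output, loss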
The training will use mini-batches with the size defined above.\n\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. 
* \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: {:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. 
We'll also do **the same** transformations as on train/validation dataset (very important).\n\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # CIFAR10 Hands-on\n#\n# -------------------------------\n#\n# Notebook below shows how to manipulate images, train/test Convolutional Neural Network and visualize the learning results on CIFAR10 dataset. More information about the dataset can be found on the [Alex Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html).\n#\n# Before running below hands-on, recall all your knowledge about:\n# - Training neural networks with SGD,\n# - Convolutional Neural Networks.\n#\n# You can also refer to my presentation that you can find in the root directory of this repository. HTML version with all the GIFs is available [here](https://mega.nz/#%21H4IEnZKJ%21so0Czkp8lcLWCt0o3O912WnKZBFjkvZFeJG23kITpig).\n\n# ### Before you start\n#\n# In below code you can find many tags that highlight places on which you can work.\n#\n# **Available tags:**\n# - `[TRY ME]` - places where you can change some values and try how such entries affect other components,\n# - `[TODO]` - places where you have to write your own implementation for some functions/parts of code.\n#\n# Let's start :)\n\n# ### Prepare dataset\n# At first, let's prepare the dataset with all the images and classes. 
We'll use `torchvision` package which is great to start working with the most popular datasets with just one line of code!\n\n#%%\n\n# Let's import all the packages we will use during this hands-on\nfrom torchvision.datasets import CIFAR10\nfrom torch.utils.data.sampler import SubsetRandomSampler\nfrom torch.utils.data import DataLoader\nfrom torch.autograd import Variable\nimport torch.optim as optim\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport torch\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport random\nimport pickle\nimport os\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n# Now, let's load our dataset. The CIFAR-10 dataset consists of **60000 32x32 colour images in 10 classes**, with 6000 images per class. There are 50000 training images and 10000 test images. The dataset will download automatically into the root directory of this repository.\n\n#%%\n\ntrain_dataset = CIFAR10('.', train=True, download=True)\nprint('There are {} training pictures.'.format(len(train_dataset)))\n\n\n# As you can see above, all of the images have already been splitted into train and test set by the `torchvision` library. With this handy trick, we can save our time working with the original files :)\n#\n# Now, let's get all available labels from the metadata file stored together with the CIFAR10 images. It's a simple Python dictionary pickled into a file, so the only thing we need to do is load it and read the `label_names` field.\n\n#%%\n\nwith open('./cifar-10-batches-py/batches.meta', 'rb') as metadata:\n LABELS = pickle.load(metadata)['label_names']\nprint('All available classes: {}.'.format(LABELS))\n\n\n# Let's look into the dataset itself and visualise an example image.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Below image shows: {}'.format(LABELS[example_class]))\n# [TRY ME] Check interpolation methods, eg. 'gaussian'\nplt.imshow(example_image)\n\n\n# ### Data preprocessing\n\n# We will start our preprocessing with conversion of the images from the CIFAR10 datasets to the numpy arrays. 
Currently, they are PIL Images, which makes them unuseable with any of the available Machine Learning frameworks.\n\n#%%\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types before conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\ndef conversion_to_numpy(example): return (np.array(example[0]), example[1])\n\n\ntrain_dataset = list(map(conversion_to_numpy, train_dataset))\n\nexample_image, example_class = random.choice(train_dataset)\nprint('Types after conversion: ({}, {})'.format(\n type(example_image), type(example_class)))\n\n\n# Once we've prepared data for further manipulation, it's time to split our initial training dataset into train and validation sets!\n\n#%%\n\n# [TRY ME] Proportion in which we should split training dataset into smaller sets\nVALIDATION = 0.2\n\n# Let's compute where we should split our training dataset\nnumber_of_training_examples = len(train_dataset)\nindices = list(range(number_of_training_examples))\nsplitting_point = int(np.floor(VALIDATION * number_of_training_examples))\n\n# Shuffle all the indices, so our dataset will be equally distributed\nnp.random.shuffle(indices)\n\n# Split the indices in the splitting point\ntrain_idx, valid_idx = indices[splitting_point:], indices[:splitting_point]\n\n# Prepare training and validation datasets with examples\ntraining_images = [train_dataset[i][0] for i in train_idx]\ntraining_classes = [train_dataset[i][1] for i in train_idx]\nvalidation_images = [train_dataset[i][0] for i in valid_idx]\nvalidation_classes = [train_dataset[i][1] for i in valid_idx]\nprint('Initial training dataset has: {} examples.'.format(len(train_dataset)))\nprint('Now, training dataset has: {} examples.'.format(len(training_images)))\nprint('Now, validation dataset has: {} examples.'.format(len(validation_images)))\n\n\n# #### Recall from CS231n course\n#\n# ![Data Preprocessing](./assets/data_preprocessing.jpeg)\n#\n# > **Common pitfall.** An important point to make about the preprocessing is that any preprocessing statistics (e.g. the data mean) must only be computed on the training data, and then applied to the validation / test data. E.g. computing the mean and subtracting it from every image across the entire dataset and then splitting the data into train/val/test splits would be a mistake. Instead, the mean must be computed only over the training data and then subtracted equally from all splits (train/val/test).\n# http://cs231n.github.io/neural-networks-2/#datapre\n\n# Now, let's compute mean and standard deviation, which will be used to zero center and normalize dataset.\n\n#%%\n\n# Calculate mean and std dev for all images from the training dataset\nMEAN_IMAGE = np.mean(training_images, axis=0)\nSTD_DEV_IMAGE = np.std(training_images, axis=0)\n\n# Let's visualize them!\nfig, subplots = plt.subplots(1, 2)\nsubplots[0].set_title('Mean')\nsubplots[0].imshow(MEAN_IMAGE)\nsubplots[1].set_title('Std')\nsubplots[1].imshow(STD_DEV_IMAGE)\n\n\n# Once, we've got mean and standard derivative let's apply them to our datasets (both training and validation dataset).\n\n#%%\n\ntraining_images = (training_images - MEAN_IMAGE) / STD_DEV_IMAGE\nvalidation_images = (validation_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n\n# The one last thing is strictly connected with the way we will create our model. The standard approach (in the most frameworks/papers) is to use the \"channels first\" order, where the first dimension of the input array is the feature channel. 
Right now, our images have `(NUMBER_OF_IMAGES, 32, 32, 3)` shape. In order to fit them into the neural network, we've got to swap the last dimension with the second one, so our images will follow the `[BATCH, CHANNEL, IMAGE_Y, IMAGE_X]` approach.\n#\n# **Remember** to always work on both the training and validation dataset! Later in this notebook, we will also apply such transformations to the test set.\n\n#%%\n\nprint('Training examples before: {}'.format(training_images.shape))\nprint('Validation examples before: {}'.format(validation_images.shape))\n\ntraining_images = np.swapaxes(training_images, 2, 3)\ntraining_images = np.swapaxes(training_images, 1, 2)\nvalidation_images = np.swapaxes(validation_images, 2, 3)\nvalidation_images = np.swapaxes(validation_images, 1, 2)\n\nprint('Training examples after: {}'.format(training_images.shape))\nprint('Validation examples after: {}'.format(validation_images.shape))\n\n\n# Images are now ready to be used! But... our classes needs to be transformed into \"one hot\" format.\n#\n# **One Hot** format is a way to represent our integer classes with the representation of **distribution probability** for each available class. Such values will be reconstructed by the neural network on the last layer.\n#\n# _**Example one hot mapping:**_\n#\n# | Class | Previously | One Hot Representation |\n# |------------|------------|--------------------------------|\n# | airplane | 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | automobile | 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |\n# | bird | 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |\n# | ... | ... | ... |\n#\n# Let's convert our classes to the One Hot format.\n\n#%%\n\ndef convert_to_one_hot(old_class):\n \"\"\"Convert classic integer label to the 'one hot' format\"\"\"\n # [TODO] Conversion to One Hot format\n return old_class\n\n\ntraining_classes = list(map(convert_to_one_hot, training_classes))\nvalidation_classes = list(map(convert_to_one_hot, validation_classes))\n\n\n# Our training and validation examples are now ready to be used for training our Convolutional Neural Network!\n#\n# Yay! :)\n\n# ### Prepare CNN model\n# It's high time to prepare our Convolutional Neural Network model! We'll use PyTorch to do so :)\n#\n# **Why PyTorch?** Mostely, because it's great for learning! It shows all the inside things that has to happen to train our network. Every other framework will do many of these things for us but here we've got to do it on our own. What's more, PyTorch is written in pure Python, which makes it great to experiment with. 
It's also Open Source, so you can look inside of the code, ask people about it (community is great) and even work on your own!\n#\n# Let's prepare the model!\n\n#%%\n\nclass ConvolutionalNeuralNetwork(nn.Module):\n \"\"\"Our Convolutional Neural Network model\"\"\"\n\n def __init__(self):\n \"\"\"Initialize the network components\"\"\"\n super(ConvolutionalNeuralNetwork, self).__init__()\n # [TODO] 2DConv with 3 input channels, 32 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 32 input channels, 64 filters output and 3x3 kernel filter\n # [TODO] 2DConv with 64 input channels, 128 filters output and 3x3 kernel filter\n # [TODO] Dense linear layer with 512 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 128 output neurons\n # [TODO] Dense linear layer with 128 input neurons and 10 output neurons\n\n def forward(self, x):\n \"\"\"Run forward pass of the network\"\"\"\n # Current x: [BATCH_SIZE, 3, 32, 32]\n # [TODO] First 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 32, 15, 15]\n # [TODO] Second 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 64, 6, 6]\n # [TODO] Third 2DConv with ReLu and 2DMaxPooling with 2x2 filters\n\n # Current x: [BATCH_SIZE, 128, 2, 2]\n # [TODO] Flatten x, so we'll be able to pass it into the linear layer\n\n # Current x: [BATCH_SIZE, 512]\n # [TODO] First linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Second linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 128]\n # [TODO] Third linear layer with ReLu\n\n # Current x: [BATCH_SIZE, 10]\n return x\n\n def get_number_of_flat_features(self, x):\n \"\"\"Calculate number of flat features\"\"\"\n size = x.size()[1:]\n num_features = 1\n for s in size:\n num_features *= s\n return num_features\n\n\n# Now, we've got to create our network by calling the class' initializer.\n\n#%%\n\ncnn = ConvolutionalNeuralNetwork()\nprint(cnn)\n\n\n# ### Train the model\n# Our model is ready to be trained. Before we do so, let's prepare some helper/utility functions.\n#\n# First one will help us with shuffling all given examples. It's very important to shuffle images and classes in the same way, so that they'll be still coupled! We don't want to loose the dataset and mix all the labels :)\n\n#%%\n\ndef shuffle_examples(images, classes):\n \"\"\"Shuffle images & classes and respects they order\"\"\"\n combined = list(zip(images, classes))\n random.shuffle(combined)\n new_images, new_classes = zip(*combined)\n return new_images, new_classes\n\n\n# One of the metrics that we will be using is accuracy. **Accuracy** tell us how many labels were properly classified. To do so, we'll check the best class which was predicted and true label from the dataset. If they are the same, we should increase the accuracy. 
Accuracy will be representend as a percentage value.\n#\n# That's why we need a function that tell us number of correct labels for given batch:\n\n#%%\n\ndef get_number_of_correct_labels(network_output, true_labels):\n \"\"\"Return number of correctly predicted labels\n\n Correct classification return 1 for given example.\n \"\"\"\n predicted_classes = network_output.topk(\n 1, 1)[1] # Indexes for the Top-1 values\n true_classes = true_labels.topk(1, 1)[1] # Indexes for the Top-1 values\n return predicted_classes.eq(true_classes).float().sum().data[0]\n\n\n# Next, we need a function that will update the figure with Loss and Accuracy on both the training and validation datasets:\n\n#%%\n\ndef update_figures(figure, training_losses, validation_losses, training_accuracies, validation_accuracies):\n \"\"\"Update and draw the figure with 'Loss' and 'Accuracy' plots\"\"\"\n # Clear whole figure - remove all content, titles, legend, everything!\n figure.clear()\n\n # 'Loss' plot\n plt.subplot(2, 1, 1)\n plt.grid(True)\n plt.title('Loss')\n plt.xlabel('Epoch')\n plt.ylabel('Value')\n plt.plot(range(len(training_losses)),\n training_losses, '.r-', label='Training')\n plt.plot(range(len(validation_losses)),\n validation_losses, '.b-', label='Validation')\n\n # 'Accuracy' plot\n plt.subplot(2, 1, 2)\n plt.grid(True)\n plt.title('Accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Value [%]')\n plt.plot(range(len(training_accuracies)),\n training_accuracies, '.r-', label='Training')\n plt.plot(range(len(validation_accuracies)),\n validation_accuracies, '.b-', label='Validation')\n\n # Final rendering\n plt.tight_layout() # Make all the above plots look neat and tidy\n plt.legend(bbox_to_anchor=(1, 0), loc='lower right',\n bbox_transform=figure.transFigure, ncol=3)\n figure.canvas.draw() # Update the figure\n\n\n# Before we define functions that will train our Convolutional Neural Network, let's define the hyperparameters for our training. These will be:\n# - `BATCH_SIZE` - tell us how many examples are in a single batch,\n# - `LEARNING_RATE` - tell us how much our weights will be updated using optimizer,\n# - `NUMBER_OF_EPOCHS` - tell us how long should we run the training.\n\n#%%\n\n# [TRY ME] All hyper parameters for the training\nBATCH_SIZE = 64\nLEARNING_RATE = 0.1\nNUMBER_OF_EPOCHS = 10\n\n\n# There are also two additional things:\n# - `loss_function` - will be used to compute the loss. In our case it's Binary Cross Entropy,\n# - `optimizer` - defines the opitimizer (algorithm for optimizing weights) which will be used during the training. In our case it's SGD.\n#\n# Feel free to play with these things and check the results of the training :)\n\n#%%\n\n# [TRY ME] Prepare loss function with optimizer\nloss_function = nn.BCELoss() # Binary Cross Entropy\noptimizer = optim.SGD(cnn.parameters(), lr=LEARNING_RATE)\n\n\n# Now, let's define our training function. It will take images and classes as an input and return loss and accuracy as an output. 
The training will use mini-batches with the size defined above.\n\n#%%\n\ndef train(training_images, training_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Train our network in batches\n NUMBER_OF_TRAINING_BATCHES = int(len(training_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_TRAINING_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number, NUMBER_OF_TRAINING_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = training_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = training_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's train the network!\n # [TODO] Reset all gradients in the model\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n # [TODO] Compute gradients needed to tune the network's weights\n # [TODO] Backprop with above gradients\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n training_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the training examples\n training_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(training_images)\n\n return training_loss, training_accuracy\n\n\n# Validation looks (nearly) the same. And... 
it's acctually a copy-paste :) The only thing that has changed are not computing the gradients and not updating weights with backpopagation.\n\n#%%\n\ndef validation(validation_images, validation_classes):\n _epoch_losses = [] # Keep losses for each batch\n # Contains numbers of properly classified images per batch\n _epoch_properly_classified = []\n\n # Validate our network in batches\n NUMBER_OF_VALIDATION_BATCHES = int(len(validation_images) / BATCH_SIZE)\n for batch_number in range(NUMBER_OF_VALIDATION_BATCHES):\n if batch_number % 100 == 0:\n print('Batch #{}/{}...'.format(batch_number,\n NUMBER_OF_VALIDATION_BATCHES))\n\n # Take batch of images & classes and convert them to the PyTorch Variable for further use\n batch_images = validation_images[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_images = Variable(torch.from_numpy(\n np.array(batch_images)).float())\n batch_classes = validation_classes[batch_number *\n BATCH_SIZE:(batch_number+1) * BATCH_SIZE]\n batch_classes = Variable(torch.from_numpy(\n np.array(batch_classes)).float())\n\n # Let's validate the network!\n # [TODO] Compute output based on input images\n # [TODO] Compute loss based on output and true classes\n\n # Remember metrics for this batch\n _epoch_losses.append(loss.data[0])\n _epoch_properly_classified.append(\n get_number_of_correct_labels(net_output, batch_classes))\n\n # Loss for this epoch is equal to the mean of all the losses collected for each batch\n validation_loss = np.mean(np.array(_epoch_losses))\n\n # Accuracy for this epoch is equal to all the correctly classified images\n # divided by all of the validation examples\n validation_accuracy = 100. * \\\n np.sum(_epoch_properly_classified) / len(validation_images)\n\n return validation_loss, validation_accuracy\n\n\n# Our plots needs to store the history somewhere, so let's define places for them now.\n\n#%%\n\n# Clear history of the training losses and accuracies\ntraining_losses = []\nvalidation_losses = []\ntraining_accuracies = []\nvalidation_accuracies = []\n\n\n# Now, let's do the training!\n\n#%%\n\n# Prepare figure to show losses and accuracy\nplt.close()\nfigure = plt.figure()\nupdate_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n# Train the network in epochs\nfor epoch in range(NUMBER_OF_EPOCHS):\n print('Starting epoch #{}.'.format(epoch))\n\n # Let's shuffle all the training & validation examples\n training_images, training_classes = shuffle_examples(\n training_images, training_classes)\n validation_images, validation_classes = shuffle_examples(\n validation_images, validation_classes)\n\n # Train our network\n training_loss, training_accuracy = train(training_images, training_classes)\n training_losses.append(training_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n training_accuracies.append(training_accuracy)\n\n # Debug logging and update the figures\n print(' Training loss: {:.4f}.'.format(training_loss))\n print(' Training accuracy: {:.2f}%'.format(training_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # Validate our network\n validation_loss, validation_accuracy = validation(\n validation_images, validation_classes)\n validation_losses.append(validation_loss) # History for 'Loss' plot\n # History for 'Accuracy' plot\n validation_accuracies.append(validation_accuracy)\n\n # Debug logging and update the figures\n print(' Validation loss: 
{:.4f}.'.format(validation_loss))\n print(' Validation accuracy: {:.2f}%'.format(validation_accuracy))\n update_figures(figure, training_losses, validation_losses,\n training_accuracies, validation_accuracies)\n\n # [TRY ME] Here, you can add some additional manipulation on optimizer based on\n # training & validation metrics, eg. lower the Learning Rate in case\n # of overfitting.\n\n\n# You can always save the current weights with below method that will store the current state of the network on disk.\n\n#%%\n\n# model_filename = 'model.pt'\n# torch.save(cnn.state_dict(), os.getcwd() + '/' + model_filename)\n\n\n# Similar way you can always restore the state of the network with:\n\n#%%\n\n# model_filename = 'model.pt'\n# cnn = ConvolutionalNeuralNetwork()\n# cnn.load_state_dict(torch.load(os.getcwd() + '/' + model_filename))\n\n\n# ### Testing our solution\n# To test our solution we will use the test dataset delivered with CIFAR10 itself.\n\n#%%\n\ntest_dataset = CIFAR10('.', train=False)\nprint('There are {} test pictures.'.format(len(test_dataset)))\n\n\n# In order to test our neural network properly, we've got to prepare our images in the same way we've done it with the validation dataset. We'll use **the same** mean and standard deviation values as we've used previously. We'll also do **the same** transformations as on train/validation dataset (very important).\n\n#%%\n\n# Convert all images to numpy arrays\ntest_dataset = list(map(conversion_to_numpy, test_dataset))\n\n# Split the test dataset into images and classes\ntest_images = np.array([example[0] for example in test_dataset])\ntest_classes = np.array([example[1] for example in test_dataset])\n\n# Apply **the same** mean and std values to the test examples\ntest_images = (test_images - MEAN_IMAGE) / STD_DEV_IMAGE\n\n# Swap the channels to match the network input ([SIZE, 32, 32, 3] -> [SIZE, 3, 32, 32])\ntest_images = np.swapaxes(test_images, 2, 3)\ntest_images = np.swapaxes(test_images, 1, 2)\n\n# Convert all classes to \"One Hot\" format\ntest_classes = np.array(list(map(convert_to_one_hot, test_classes)))\n\n\n# Also, we'll get the loss and accuracy using validation method. We can do so, because it doesn't do anything more than we want now :) In the future it may happen that validation method may do something more, so be aware about it!\n\n#%%\n\ntest_loss, test_accuracy = validation(test_images, test_classes)\nprint('Test loss: {:.4f}.'.format(test_loss))\nprint('Test accuracy: {:.2f}%'.format(test_accuracy))\n\n\n# Our model is performing somehow. 
It's not bad but it's also not perfect...\n#\n# Even though, let's visualise some examples and verify the predicted classes on our own :)\n\n#%%\n\n# Close previously opened plot - needed due to interrupting drawing loop of the previous figure\nplt.close()\n\n# Choose random image from the test dataset and prepare input/output for the network\nindex = random.choice(range(len(test_dataset)))\ntest_image, proper_class = test_images[index], test_classes[index]\n\n# Input image has to be expanded with the batch dimension ([3, 32, 32] -> [1, 3, 32, 32])\ntest_image = np.expand_dims(test_image, axis=0)\n\n# Predict class for above random image\n# All the network inputs has to be PyTorch's Variables!\ntest_image = Variable(torch.from_numpy(test_image).float())\npredicted_classes = cnn(test_image)\n\n# Let's get classes based on \"One Hot\" format (which means that we are looking for the\n# index/argument with the maximum value)\npredicted_classes = predicted_classes.data.numpy()\npredicted_class = np.argmax(predicted_classes)\nproper_class = np.argmax(proper_class)\n\n# Show the image with true/predicted classes\nprint('Below image shows: {}'.format(LABELS[proper_class]))\nprint('Our network predicted: {}'.format(LABELS[predicted_class]))\nother_preditions = [(LABELS[idx], _c)\n for idx, _c in enumerate(predicted_classes[0])]\nother_preditions = sorted(\n other_preditions, key=lambda predition: predition[1], reverse=True)\nprint('Other network predictions: {}'.format(other_preditions))\nplt.imshow(test_dataset[index][0])\n\n\n# ### Convolution visualization\n# There are many ways to visualize Convolutional Neural Networks. Here is the simplest one. We'll pass above test image through first two convolutional layers and see the output.\n#\n# More sophisticated methods deals with looking into the weights of the convolution and try to interpret them. These are more complecated and won't be covered by this hands-on. 
For more information refer to [this CS231n lecture](https://www.youtube.com/watch?v=ta5fdaqDT3M).\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(32, 3, figsize=(6, 64))\n\n# Pass test image through first convolution layer\nconv_pass = cnn.conv1(test_image)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(32):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])\n\n# Annotate plots\nsubplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\nfig.tight_layout()\n\n\n# Let's do the same for the second layer!\n\n#%%\n\n# Prepare figure with subplots\nplt.close()\nfig, subplots = plt.subplots(64, 3, figsize=(6, 128))\n\n# Pass test image through first convolution layer\nfirst_conv = F.max_pool2d(F.relu(cnn.conv1(test_image)), (2, 2))\n\n# Pass test image through second convolution layer\nconv_pass = cnn.conv2(first_conv)\nrelu_pass = F.relu(conv_pass)\npool_pass = F.max_pool2d(relu_pass, (2, 2))\n\n# Fetch numpy data from PyTorch Variables\nconv_pass_numpy = conv_pass[0].data.numpy()\nrelu_pass_numpy = relu_pass[0].data.numpy()\npool_pass_numpy = pool_pass[0].data.numpy()\n\n# Iterate over filters and visualize each of them\nfor i in range(64):\n subplots[i, 0].imshow(conv_pass_numpy[i])\n subplots[i, 1].imshow(relu_pass_numpy[i])\n subplots[i, 2].imshow(pool_pass_numpy[i])", "original_comment": "# Annotate plots\n", "target_code": "subplots[0, 0].set_title('Convolution')\nsubplots[0, 1].set_title('ReLu')\nsubplots[0, 2].set_title('MaxPooling')\n", "project_metadata": {"full_name": "jpowie01/CIFAR10-HandsOn", "description": "Hands-on prepared for one of my presentations that took place on Computer Vision's mini-course at student's orgranization called \"Gradient\" (Gda\u0144sk University of Technology)", "topics": ["deep-learning", "convolutional-neural-networks", "cifar10", "jupyter-notebook", "hands-on"], "git_url": "git://github.com/jpowie01/CIFAR10-HandsOn.git", "stars": 6, "watchers": 6, "forks": 0, "created": "2018-01-03T21:22:35Z", "size": 9589, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1717141}, "last_updated": "2018-01-09T19:26:07Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.annotate('MaxPooling', xy=(0.5, 0.5), xytext=(0.5, 0.5),\n arrowprops=dict(facecolor='black', shrink=0.05))\nplt.annotate('Convolution', xy=(0.5, 0.5), xytext=(0.5, 0.5),\n arrowprops=dict(facecolor='black', shrink=0.05))\nplt.annotate('ReLU', xy=(0.5, 0.5), 
xytext=(0.5, 0.5),\n arrowprops=dict(facecolor='black\n", "model": "docstring", "intent": "# Annotate plots"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Reference\n#\n# https://www.analytics-link.com/post/2019/07/11/creating-pop-art-using-opencv-and-python\n\n#%%\n\nimport cv2\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport glob\nfrom IPython.display import clear_output\n\n#%%\n\ndef show_img(img):\n image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n plt.imshow(image_rgb)\n plt.show()\n\n#%%\n\ndef img_processing(origin_img):\n\n # set colours (BGR)\n background_colour = [19, 247, 224]\n dots_colour = (247, 19, 217)\n\n # set the max dots (on the longest side of the image)\n max_dots = 120\n\n # import the image as greyscale\n gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)", "original_comment": " # extract dimensions\n", "target_code": " original_image_height, original_image_width = gray_img.shape\n", "project_metadata": {"full_name": "howarder3/ironman2020_OpenCV_photoshop", "description": null, "topics": [], "git_url": "git://github.com/howarder3/ironman2020_OpenCV_photoshop.git", "stars": 2, "watchers": 2, "forks": 1, "created": "2020-09-12T15:55:03Z", "size": 125635, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 131231786}, "last_updated": "2020-12-23T03:20:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "img = cv2.imread(\n r'C:\\Users\\User\\Desktop\\Projects\\urban_sound\\UrbanSound8K\\audio\\fold1-2.wav')\nimg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\nshow_img(img)\n", "model": "no-comments", "intent": " # extract dimensions"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. 
The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. 
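# Since "Gini Impurity" is listed among the covered aspects above, here is a
# tiny illustrative helper (not part of the original kernel): the impurity of a
# node is 1 - sum_i p_i**2 over the class proportions p_i, which is the
# criterion DecisionTreeClassifier minimises by default (criterion='gini').

import numpy as np


def gini_impurity(labels):
    """Gini impurity of an array of class labels."""
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / counts.sum()
    return 1.0 - np.sum(proportions ** 2)


print(gini_impurity([1, 1, 1, 1]))  # 0.0  (pure node)
print(gini_impurity([0, 1, 0, 1]))  # 0.5  (50/50 split)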
We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
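# The introduction above says the best tree depth will be found with the help
# of cross-validation. A minimal sketch of that idea follows, using the
# KFold/cross_val_score/tree imports above; `X` and `y` are placeholders for a
# numeric feature matrix and label vector, not variables defined at this point.

from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score


def best_depth_sketch(X, y, max_depth_to_try=10):
    """Return the tree depth with the highest mean cross-validated accuracy."""
    cv = KFold(n_splits=10, shuffle=True, random_state=1)
    mean_accuracy_per_depth = {}
    for depth in range(1, max_depth_to_try + 1):
        clf = tree.DecisionTreeClassifier(max_depth=depth)
        scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
        mean_accuracy_per_depth[depth] = scores.mean()
    return max(mean_accuracy_per_depth, key=mean_accuracy_per_depth.get)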
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n dataset.loc[dataset['Fare'] 
<= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\ndrop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Abstract ##\n#\n# In this Kernel we're going to take a look at [*Decision Trees*][1] using *Python* and the Titanic dataset. It's not intended to be the most accurate Titanic survival model out there, but to explain how to create, visualise and understand *Classification Trees*. The main aspects covered are:\n#\n# - Learning from the data with *Decision Trees*\n# - Dataset exploration and processing\n# - Relevant features for *Decision Trees*\n# - Gini Impurity\n# - Finding best tree depth with the help of cross-validation\n# - Generating and visualising the final model\n#\n# This is my first Kernel, so please feel free to include any suggestions, comments or critics!\n#\n# [1]: https://en.wikipedia.org/wiki/Decision_tree_learning\n\n# Introduction\n# --------------------\n#\n# When applying Machine Learning algorithms, it's critical to always keep in mind the problem we're trying to solve. In most cases, the most accurate and robust model might be what you're looking for. But sometimes we need to actually get insights from the available data and in these cases transparent, easy to understand models like *Decision Trees* will greatly simplify our task.\n#\n# If we need to build a model that will be directly used for some task and **only show it's end results**, then we don't really care about building some kind of \"blackbox\" if it's accurate enough (image or speech recognition for example). That's why advanced techniques such as [*Deep Learning*][1] or [*Ensemble Learning*][2] (cf. [Anisotropic Kernel][3]) are commonly used for complex tasks. But remember the KISS principle (Keep It Simple, Stupid)! Always consider the complexity/accuracy trade-off: complex techniques should only be used if they offer significant improvements. Simpler models are also less prone to over-fitting and tend to generalise better.\n#\n# But if we're using Machine Learning to actually **get insights from the data**, \"blackbox\" models are almost useless and it's best to stick with simpler, transparent techniques. Let's take the case of a supermarket looking to better understand customer behaviour: the straightforward [*Apriori*][4] algorithm can quickly offer relevant insights like \"80% of customers who bought a suit also bought a tie\" so they may try to increase tie sales by offering a discount to clients buying a suit . Of course, a complex classification algorithm will do better at identifying the customers who bought a tie by taking into account more features, but is that really useful for the supermarket?\n#\n# *Decision Trees* can also help a lot when we need to understanding the data. 
A good example is the traditional problem of classifying Iris flowers included in the [sklearn documentation][5], were we can learn about the characteristics of each flower type in the resulting tree. Given their transparency and relatively low computational cost, *Decision Trees* are also very useful for exploring your data before applying other algorithms. They're helpful for checking the quality of engineered features and identifying the most relevant ones by visualising the resulting tree.\n#\n# The main downsides of *Decision Trees* are their tendency to over-fit, their inability to grasp relationships between features, and the use of greedy learning algorithms (not guaranteed to find the global optimal model). Using them in a [*Random Forest*][6] helps mitigate some of this issues.\n#\n# After this short introduction to *Decision Trees* and their place in Machine Learning, let's see how to apply them for the Titanic challenge. First, we're going to prepare the dataset and discuss the most relevant features. We'll then find the best tree depth to avoid over-fitting, generate the final model, and explain how to visualise the resulting tree.\n#\n#\n# [1]: https://en.wikipedia.org/wiki/Deep_learning\n# [2]: https://en.wikipedia.org/wiki/Ensemble_learning\n# [3]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [4]: https://en.wikipedia.org/wiki/Apriori_algorithm\n# [5]: http://scikit-learn.org/stable/modules/tree.html\n# [6]: https://en.wikipedia.org/wiki/Random_forest\n\n# ## Preparing the Titanic dataset ##\n#\n# For the Titanic challenge we need to guess wheter the individuals from the *test* dataset had survived or not. But for our current purpose let's also find out what can the data tell us about the shipwreck with the help of a *Classification Tree*. Let's load the data and get an overview.\n\n#%%\n\n# Imports needed for the script\nfrom PIL import Image, ImageDraw, ImageFont\nfrom subprocess import check_call\nfrom IPython.display import Image as PImage\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import KFold\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import tree\nimport plotly.tools as tls\nimport plotly.graph_objs as go\nimport plotly.offline as py\nimport numpy as np\nimport pandas as pd\nimport re\nimport xgboost as xgb\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('matplotlib', 'inline')\n\npy.init_notebook_mode(connected=True)\n\n\n# Loading the data\ntrain = pd.read_csv('../input/train.csv')\ntest = pd.read_csv('../input/test.csv')\n\n# Store our test passenger IDs for easy access\nPassengerId = test['PassengerId']\n\n# Showing overview of the train dataset\ntrain.head(3)\n\n\n# Thanks to this overview we can see that our dataset needs some treatment. The class *Survived* is already in binary format so no additional formatting is necessary, but features like *Name*, *Ticket* or *Cabin* need to be adapted for the problem we're trying to solve, and we can also engineer some new features by merging or regrouping existing ones. 
There's already extended work on this so we're just using one the best approches out there (credit to [Sina][1], [Anisotropic][2] and also [Megan Risdal][3] for the suggestion of the \"Title\" feature).\n#\n#\n# [1]: https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier\n# [2]: https://www.kaggle.com/arthurtok/titanic/introduction-to-ensembling-stacking-in-python\n# [3]: https://www.kaggle.com/mrisdal/titanic/exploring-survival-on-the-titanic\n\n#%%\n\n# Copy original dataset in case we need it later when digging into interesting features\n# WARNING: Beware of actually copying the dataframe instead of just referencing it\n# \"original_train = train\" will create a reference to the train variable (changes in 'train' will apply to 'original_train')\n# Using 'copy()' allows to clone the dataset, creating a different object with the same values\noriginal_train = train.copy()\n\n# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings\nfull_data = [train, test]\n\n# Feature that tells whether a passenger had a cabin on the Titanic\ntrain['Has_Cabin'] = train[\"Cabin\"].apply(\n lambda x: 0 if type(x) == float else 1)\ntest['Has_Cabin'] = test[\"Cabin\"].apply(lambda x: 0 if type(x) == float else 1)\n\n# Create new feature FamilySize as a combination of SibSp and Parch\nfor dataset in full_data:\n dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1\n# Create new feature IsAlone from FamilySize\nfor dataset in full_data:\n dataset['IsAlone'] = 0\n dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1\n# Remove all NULLS in the Embarked column\nfor dataset in full_data:\n dataset['Embarked'] = dataset['Embarked'].fillna('S')\n# Remove all NULLS in the Fare column\nfor dataset in full_data:\n dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())\n\n# Remove all NULLS in the Age column\nfor dataset in full_data:\n age_avg = dataset['Age'].mean()\n age_std = dataset['Age'].std()\n age_null_count = dataset['Age'].isnull().sum()\n age_null_random_list = np.random.randint(\n age_avg - age_std, age_avg + age_std, size=age_null_count)\n # Next line has been improved to avoid warning\n dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list\n dataset['Age'] = dataset['Age'].astype(int)\n\n# Define function to extract titles from passenger names\n\n\ndef get_title(name):\n title_search = re.search(' ([A-Za-z]+)\\.', name)\n # If the title exists, extract and return it.\n if title_search:\n return title_search.group(1)\n return \"\"\n\n\nfor dataset in full_data:\n dataset['Title'] = dataset['Name'].apply(get_title)\n# Group all non-common titles into one single grouping \"Rare\"\nfor dataset in full_data:\n dataset['Title'] = dataset['Title'].replace(\n ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')\n\n dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n\nfor dataset in full_data:\n # Mapping Sex\n dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n\n # Mapping titles\n title_mapping = {\"Mr\": 1, \"Master\": 2, \"Mrs\": 3, \"Miss\": 4, \"Rare\": 5}\n dataset['Title'] = dataset['Title'].map(title_mapping)\n dataset['Title'] = dataset['Title'].fillna(0)\n\n # Mapping Embarked\n dataset['Embarked'] = dataset['Embarked'].map(\n {'S': 0, 'C': 1, 'Q': 2}).astype(int)\n\n # Mapping Fare\n 
dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0\n dataset.loc[(dataset['Fare'] > 7.91) & (\n dataset['Fare'] <= 14.454), 'Fare'] = 1\n dataset.loc[(dataset['Fare'] > 14.454) & (\n dataset['Fare'] <= 31), 'Fare'] = 2\n dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3\n dataset['Fare'] = dataset['Fare'].astype(int)\n\n # Mapping Age\n dataset.loc[dataset['Age'] <= 16, 'Age'] = 0\n dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1\n dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2\n dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3\n dataset.loc[dataset['Age'] > 64, 'Age']\n\n#%%", "original_comment": "# Feature selection: remove variables no longer containing relevant information\n", "target_code": "train = train.drop(drop_elements, axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "project_metadata": {"full_name": "adgirish/kaggleScape", "description": null, "topics": [], "git_url": "git://github.com/adgirish/kaggleScape.git", "stars": 8, "watchers": 8, "forks": 4, "created": "2018-04-14T18:52:10Z", "size": 27703, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 34896084, "Python": 26724700, "HTML": 2149297}, "last_updated": "2020-01-26T20:21:29Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']\ntrain = train.drop(drop_elements, axis=1)\ntrain = train.drop(['CategoricalAge', 'CategoricalFare'], axis=1)\ntest = test.drop(drop_elements, axis=1)\n", "model": "natural", "intent": "# Feature selection: remove variables no longer containing relevant information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n\n'''\nLSTM playground\n'''\n\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka 
sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n\nbatchedInputMatrix.shape\n\n\nbatchedOutputMatrix.shape\n\n\nbatchedInputMatrix[0, 0]\n\n\nbatchedOutputMatrix[0, 0]\n\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# question: how to overcome stationarity?\n#\n# question: multi-step forward prediction -- https://machinelearningmastery.com/multi-step-time-series-forecasting/\n#\n# question: question why stateful?\n\n# ## Define Hyper-Parameters\n\n#%%\n\nfrom __future__ import print_function\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom keras.layers import Dense, SimpleRNN, LSTM, Dropout\nfrom keras.models import Sequential\nfrom keras.callbacks import EarlyStopping\nfrom keras.callbacks import ModelCheckpoint\nfrom sklearn.model_selection import train_test_split\nget_ipython().run_line_magic('matplotlib', 'notebook')\n\n#%%\n\n'''\nLSTM playground\n'''\n\n#%%\n\ntrainingParams = {}\ntrainingParams['batchSize'] = 128\ntrainingParams['unrolledTimesteps'] = 100\ntrainingParams['inputDimensionality'] = 1\ntrainingParams['forwardSamplesToPredict'] = 10\n\ntrainingParams['trainEpochs'] = 100\n\n\n# ## Generate Target Dataset\n\n#%%\n\nxRange = np.linspace(start=0, stop=25*np.pi, num=2500000)\ntargetData = np.cos(xRange) * xRange/10\n\n#%%\n\nplt.figure()\nplt.plot(targetData)\nplt.show()\n\n\n# ## Generate Training Matrix -- Input/Output Batches\n# aka sequences subsampling -- break up big sequence into batches of unrolled timestep duration\n\n#%%\n\ndef get_batched_data(inputData, trainingParams):\n\n if len(inputData.shape) == 1:\n inputData = np.expand_dims(inputData, 1)\n\n RNNBatchSamples = int(\n trainingParams['batchSize'] * trainingParams['unrolledTimesteps'])\n totalBatchesInDataset = int(len(inputData) / RNNBatchSamples)\n\n batchedInputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n int(trainingParams['unrolledTimesteps']),\n int(trainingParams['inputDimensionality'])))\n\n batchedOutputMatrix = np.zeros((int(totalBatchesInDataset),\n int(trainingParams['batchSize']),\n 
int(trainingParams['forwardSamplesToPredict'])))\n\n for iInputDimension in range(trainingParams['inputDimensionality']):\n for iBatch in range(totalBatchesInDataset):\n startIndex = iBatch * RNNBatchSamples\n endIndex = startIndex + RNNBatchSamples\n\n batchedInputMatrix[iBatch, :, :, iInputDimension] = np.reshape(inputData[startIndex:endIndex, iInputDimension],\n (trainingParams['batchSize'],\n trainingParams['unrolledTimesteps']))\n # within a batch\n for iBatchElement in range(trainingParams['batchSize']):\n batchElementEnd = startIndex + \\\n trainingParams['unrolledTimesteps'] * (iBatchElement+1)\n batchElementPredictedTarget = batchElementEnd + \\\n trainingParams['forwardSamplesToPredict']\n batchedOutputMatrix[iBatch,\n iBatchElement] = inputData[batchElementEnd: batchElementPredictedTarget].flatten()\n\n return batchedInputMatrix, batchedOutputMatrix\n\n#%%\n\nbatchedInputMatrix, batchedOutputMatrix = get_batched_data(\n targetData, trainingParams)\n\n#%%\n\nbatchedInputMatrix.shape\n\n#%%\n\nbatchedOutputMatrix.shape\n\n#%%\n\nbatchedInputMatrix[0, 0]\n\n#%%\n\nbatchedOutputMatrix[0, 0]\n\n#%%\n\nplt.figure()\ninputData = batchedInputMatrix[0, 0]\ntargetData = batchedOutputMatrix[0, 0]\nplt.plot(np.arange(len(inputData)), inputData, 'x')\nplt.plot(np.arange(len(inputData), len(inputData) +\n len(targetData)), targetData, 'xr')\nplt.legend(['input', 'prediction target'])\nplt.show()\n\n\n# ## Define Model Architecture\n\n# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/\n#\n# ## Model Statefulness (from Keras documentation)\n#\n# Note on using statefulness in RNNs -- source: https://keras.io/layers/recurrent/\n#\n# You can set RNN layers to be 'stateful', which means that the states computed for the samples in one batch will be reused as initial states for the samples in the next batch. This assumes a one-to-one mapping between samples in different successive batches.\n#\n# To enable statefulness: - specify stateful=True in the layer constructor. - specify a fixed batch size for your model, by passing if sequential model: batch_input_shape=(...) to the first layer in your model. else for functional model with 1 or more Input layers: batch_shape=(...) to all the first layers in your model. This is the expected shape of your inputs including the batch size. It should be a tuple of integers, e.g. (32, 10, 100). 
- specify shuffle=False when calling fit().\n#\n# To reset the states of your model, call .reset_states() on either a specific layer, or on your entire model.\n#\n\n# ## Simple Dense Model\n\n#%%\n\nprint('creating model')\nsimpleDenseModel = Sequential()\n\n# lstm layer\nsimpleDenseModel.add(Dense(10, input_dim=trainingParams['unrolledTimesteps']))\n\n# output layer\nsimpleDenseModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleDenseModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple non-Stateful LSTM Model\n\n#%%\n\nprint('creating model')\nsimpleNonStatefulModel = Sequential()\n\n# lstm layer\nsimpleNonStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=False))\n# dense layer\nsimpleNonStatefulModel.add(Dense(10, activation='sigmoid'))\n\n# output layer\nsimpleNonStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n\n# compile\nsimpleNonStatefulModel.compile(loss='mse', optimizer='adam')\n\n\n# ## Simple Stateful Model\n\n#%%\n\nprint('creating model')\nsimpleStatefulModel = Sequential()\n\n# lstm layer\nsimpleStatefulModel.add(LSTM(10,\n batch_input_shape=(trainingParams['batchSize'],\n trainingParams['unrolledTimesteps'],\n trainingParams['inputDimensionality']),\n return_sequences=False, # needed in case we stack LSTM layers\n stateful=True))\n# dense layer\nsimpleStatefulModel.add(Dense(10, activation='sigmoid'))", "original_comment": "# output layer\n", "target_code": "simpleStatefulModel.add(\n Dense(trainingParams['forwardSamplesToPredict'], activation='linear'))\n", "project_metadata": {"full_name": "miroenev/teach_DL", "description": null, "topics": [], "git_url": "git://github.com/miroenev/teach_DL.git", "stars": 36, "watchers": 36, "forks": 15, "created": "2017-07-19T18:01:29Z", "size": 98182, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 12259106, "Python": 43930, "Dockerfile": 2478, "Shell": 1713}, "last_updated": "2020-09-04T16:13:54Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "precision": "Agree", "precision-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "usefulness": "Strongly agree", "usefulness-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "simpleDenseModel.add(Dense(1, activation='linear'))\nsimpleDenseModel.summary()\n", "model": "natural", "intent": "# add output layer"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. 
To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
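\n\n# A brief illustrative sketch first (not part of the original lab; 'bb_k', 'bb_upper_20' and 'bb_lower_20' are hypothetical names): the Bollinger Bands from the formula above, i.e. SMA(t, n) +/- k * sigma(t), here with k = 2 and the 20-day lookback.\nbb_k = 2.0\nbb_upper_20 = mre_mav_eurusd_data['SMAV_20'] + bb_k * mre_mav_eurusd_data['STD_20']\nbb_lower_20 = mre_mav_eurusd_data['SMAV_20'] - bb_k * mre_mav_eurusd_data['STD_20']\npd.concat([bb_upper_20.rename('BB_UPPER_20'), bb_lower_20.rename('BB_LOWER_20')], axis=1).iloc[20:25]\n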
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) signal if $z_{i}(t) \\leq -\\; \\alpha \\cdot z_{i}(t)$;\n# >- **\"Close Long-signal\"** (0.0) signal if $z_{i}(t) \\leq -\\; \\beta \\cdot z_{i}(t)$;\n# >- **\"Short-signal\"** (+1.0) signal if $z_{i}(t) \\geq +\\; \\alpha \\cdot z_{i}(t)$; and,\n# >- **\"Close Short-signal\"** (0.0) signal if $z_{i}(t) \\geq +\\; \\beta \\cdot z_{i}(t)$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set both Z-Score thresholds $\\alpha = 1.0$ and $\\beta = 0.5$ respectively, as done in the following:\n\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: inactive trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yer reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
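\n\n# Quick sanity check (illustrative, not part of the original lab): distribution of the long (+1.0), short (-1.0) and neutral (0.0) signals derived above, before defining the baseline.\nmre_mav_eurusd_data['SIGNAL_20'].value_counts()\n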
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# \n#\n# ### Lab 04 - \"Financial Data Science: Mean Reversion Trading Strategies\"\n#\n# Chartered Financial Data Scientist (CFDS), Autumn Term 2020\n\n# In this introductory lab, we create our first **financial data science process**. The main objective of this lab is to walk you through the general process of implementing and evaluating a simple **mean-reversion** trading strategy. To achieve this, we will follow the distinct process steps as outlined below:\n\n# \n\n# As always, pls. don't hesitate to ask all your questions either during the lab or send us an email (using our\n# fds.ai email addresses).\n\n# ### Lab Objectives:\n\n# After today's lab you should be able to:\n#\n# > 1. 
Implement a **mean-reversion trading strategy** and apply it to distinct financial instruments.\n# > 2. Convert the trading strategy results into **trade signals** to be used in backtest.\n# > 3. Understand how to use the **python backtesting bt** library to backtest the implemented strategy.\n# > 4. Interpret the backtests results using the distinct **backtest performance** measures.\n\n# Before we start let's watch a motivational video:\n\n#%%\n\nimport warnings\nfrom IPython.display import YouTubeVideo\n# Nvidia GTC 2017: \"I Am AI\" Opening in Keynote\"\n# YouTubeVideo('SUNPrR4o5ZA', width=800, height=600)\n\n\n# ### Setup of the Analysis Environment\n\n# We need to import a couple of Python libraries that allow for data analysis and data visualization. In this lab will use the `Pandas`, `NumPy`, `BT` and the `Matplotlib` library. Let's import the libraries by the execution of the statements below:\n\n#%%\n\n# import python utility libraries\nimport os as os\nimport datetime as dt\nimport itertools as it\n\n# import python data science libraries\nimport pandas as pd\nimport numpy as np\n\n# import the pandas financial data reader library\nimport pandas_datareader as dr\n\n# import the Python bt backtest library\nimport bt as bt\n\n# import the matplotlib and seaborn visualization library\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n\n# Let's also set a couple of general plot parameters:\n\n#%%\n\n# set general plot parameters\nplt.style.use('seaborn')\nplt.rcParams['figure.figsize'] = [10, 5]\nplt.rcParams['figure.dpi'] = 150\n\n\n# Enable inline Jupyter notebook plotting:\n\n#%%\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# Suppress potential warnings due to recent library enhancements:\n\n#%%\n\nwarnings.filterwarnings('ignore')\n\n\n# Create a **dataset** sub-folder that we will use to store the financial data downloaded:\n\n#%%\n\nif not os.path.exists('./datasets'):\n os.makedirs('./datasets')\n\n\n# ### 1. Acquire the Financial Data\n\n# In this section of the lab notebook, we will aquire historic daily stock market data of the **Euro vs. US-Dollar** foreign exchange rate (ticker symbol: \"EURUSD\"). Thereby, we will utilize the `datareader` of the `Pandas` library that provides the ability to interface the `Yahoo` finance API. Let's first specify the start date and end date of the data download. We aim to download the exchange rate data starting from the **31.12.2003** until the **31.12.2017** to develop and evaluate a simple mean-reversion trading strategy:\n\n#%%\n\n# set to start and end date of the data download\nstart_date = dt.datetime(2003, 12, 31)\nend_date = dt.datetime(2017, 12, 31)\n\n\n# Download the **daily** \"Euro vs. USD\" exchange rate data of the defined timeframe using the `datareader`'s `Yahoo` finance API:\n\n#%%\n\n# download eurusd exchange rate data\neurusd_data = dr.data.DataReader(\n 'EURUSD=X', data_source='yahoo', start=start_date, end=end_date)\n\n\n# ### 2. 
Pre-Process the Financial Data\n\n# Inspect the top 10 records of the `EURUSD` data downloaded:\n\n#%%\n\neurusd_data.head(10)\n\n\n# Visually inspect the **adjusted closing price** of the downloaded `EURUSD`\n# data:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot eurusd daily adjusted closing prices\nax.plot(eurusd_data.index, eurusd_data['Adj Close'], color='#9b59b6')\n\n# rotate x-ticks\nfor tick in ax.get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax.set_xlabel('[time]', fontsize=10)\nax.set_xlim([start_date, end_date])\nax.set_ylabel('[adjusted closing price]', fontsize=10)\n\n# set plot title\nplt.title('Euro vs. US-Dollar Exchange Rate - Historical Prices', fontsize=10)\n\n\n# Save the downloaded `EURUSD` data to the local directory:\n\n#%%\n\neurusd_data.to_csv('./datasets/eurusd_data_2003_2017_daily.csv',\n sep=';', encoding='utf-8')\n\n\n# ### 3. Data Analysis - Mean Reversion Strategy Implementation\n\n# Let's implement a simple **Mean Reversion** trading strategy. In general, **mean reversion trading** refers to the idea that extreme market movements are more likely followed by an \"average movement\" than by an even more extreme market movement.\n#\n# Mean reversion trading is derived from the observation that the price of financial instruments tend to **revert to their mean price** over time. It is assumed, that the price of a financial instrument is prone to random fluctuations around an underlying (potentially) stable trend. This behaviour can be frequently observed when analyzing price charts of foreign exchange rates such as the EUR to JPY fx-rate, as observable in the following illustration:\n\n# \n\n# In the context of mean reversion trading it is aimed to trade such (tiny) fluctuations around such more stable trends. To achieve this will, we apply a technique referred to as **\"Bollinger Bands\"** proposed by John Bollinger in the 1980's. Bollinger Bands characterize the price volatility of a financial instrument over time. In general, the bands are determined by:\n#\n#\n# $$BB^{upper}(t, n, k) = SMA(t, n) + k \\cdot \\sigma(t)$$\n#\n# $$BB^{lower}(t, n, k) = SMA(t, n) - k \\cdot \\sigma(t)$$\n#\n#\n# where $t$ denotes the current point in time and the distinct elements of the Bollinger Bands calculation correspond to:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. 
a 20, 50, or 100-days moving average lookback window;\n# >- $BB^{upper}(t, n, k)$ denotes the **upper Bollinger Band** defined by adding $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ to the simple moving average $SMA(t, n)$; and,\n# >- $BB^{lower}(t, n, k)$ denotes the **lower Bollinger Band** defined by subtracting $k$-times the positive standard deviation $\\sigma_i$ of the $n$ historical prices $p_i$ from the simple moving average $SMA(t, n)$.\n#\n# The following illustration shows the calculated Bollinger Bands $BB^{upper}$ and $BB^{lower}$ at distinct timesteps $t$ and different $k$ parametrizations:\n\n# \n\n# Let's start inspect the Bollinger Bands of the mean-reversion trading strategy by setting the moving average window lookback size:\n\n#%%\n\n# set the mean-reversion moving average indicator lookback, days = 20\nmre_lookback_days_20 = 20\n\n\n# We can calculate the moving (rolling) average using the Pandas `rolling()` and `mean()` function:\n\n#%%\n\nmre_mav_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).mean(), name='SMAV_20')\n\n\n# Similarly, we can calculate the moving (rolling) standard deviation $\\sigma$ using the Pandas `rolling()` and `std()` function:\n\n#%%\n\nmre_std_20 = pd.Series(eurusd_data['Adj Close'].rolling(\n window=mre_lookback_days_20).std(), name='STD_20')\n\n\n# Merge the obtained rolling moving average and standard deviation values with the original echange rate price data (adjusted closing prices):\n\n#%%\n\nmre_mav_eurusd_data = eurusd_data.join(mre_mav_20)\nmre_mav_eurusd_data = mre_mav_eurusd_data.join(mre_std_20)\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well as the derived (i) moving average and (ii) standard deviation values starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20']].iloc[20:30]\n\n\n# To gain an even more detailed intuition let's determine and visualize different degrees of rolling standard deviation obtainable from the 20-day moving average price. 
Obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 1$:\n\n#%%\n\n# one standard deviations\nmre_mav_eurusd_data['POS_STD1_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD1_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (1.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Similarly, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 2$:\n\n#%%\n\n# two standard deviations\nmre_mav_eurusd_data['POS_STD2_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD2_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (2.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# And finally, obtain a rolling adjusted closing price standard deviation of $\\sigma = \\pm 3$:\n\n#%%\n\n# three standard deviations\nmre_mav_eurusd_data['POS_STD3_20'] = mre_mav_eurusd_data['Adj Close'] + \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\nmre_mav_eurusd_data['NEG_STD3_20'] = mre_mav_eurusd_data['Adj Close'] - \\\n (3.0 * mre_mav_eurusd_data['STD_20'])\n\n\n# Inspect and validate the daily adjusted closing prices of the EURUSD exchange rate as well the different degrees of deviating standard deviations starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'POS_STD1_20', 'NEG_STD1_20',\n 'POS_STD2_20', 'NEG_STD2_20', 'POS_STD3_20', 'NEG_STD3_20']].iloc[20:30]\n\n\n# Plot the historical daily adjusted closing prices of the EUR vs. US-Dollar (blue) as well as its rolling 20 days standard deviations of $\\sigma=1$ standard deviations (top), $\\sigma=2$ standard deviations (middle) as well as $\\sigma=3$ standard deviations (bottom):\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 15]\nfig, ax = plt.subplots(ncols=1, nrows=3)\n\n# plot the standard deviation of 1\n\n# plot moving average adjusted closing price standard deviation of 1\nax[0].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD1_20'],\n mre_mav_eurusd_data['NEG_STD1_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 1$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[0].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[0].set_xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[0].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 2\n\n# plot moving average adjusted closing price standard deviation of 2\nax[1].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD2_20'],\n mre_mav_eurusd_data['NEG_STD2_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 2$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[1].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. 
Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[1].set_xlabel('[time]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[1].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# plot the standard deviation of 3\n\n# plot moving average adjusted closing price standard deviation of 3\nax[2].fill_between(mre_mav_eurusd_data.index, mre_mav_eurusd_data['POS_STD3_20'],\n mre_mav_eurusd_data['NEG_STD3_20'], color='C2', lw=2.0, label='$Stdv. \\sigma = 3$ (red)', alpha=0.3)\n\n# plot adjusted closing price\nax[2].plot(mre_mav_eurusd_data['Adj Close'], lw=1.0,\n color='C3', label='Adj. Closing Prices (purple)')\n\n# rotate x-tick labels\nfor tick in ax[2].get_xticklabels():\n tick.set_rotation(45)\n\n# set axis labels\nax[2].set_xlabel('[time]', fontsize=10)\nax[2].set_xlim([start_date, end_date])\nax[2].set_ylabel('[market price]', fontsize=10)\n\n# set plot legend\nax[2].legend(loc=\"upper left\", numpoints=1, fancybox=True)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=1$', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=2$', fontsize=10)\nax[2].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Historical Prices, $\\sigma=3$', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# ### 4. Mean Reversion Trading Signal Generation\n\n# We will make use of the **\"Standard-Score\"** or **\"Z-Score\"** to convert the Bollinger Band information into a series of binary long- and short-trading-signals of a mean reversion trading strategy. The **\"Z-Score\"** is the signed number of standard deviations by which the actual price $p_{i}(t)$ of a financial instrument $i$ falls above or below the moving average price, formally denoted by:\n\n# $$ z_{i}(t) = \\frac{p_{i}(t)-SMA_{i}(t,n)}{\\sigma_{i}(t, n)}$$\n\n# where $t$ denotes the current point in time and the distinct elements of the Z-Score are defined by:\n#\n# >- $SMA(t, n)$ denotes a **simple moving average** with a lookback window of $n$ historical prices $p_i$ defined by $SMA(t, n)=\\frac{1}{n} \\sum_{k=0}^{n-1} p_{i}(t-k)$, e.g. a 20, 50, or 100-days moving average lookback window; and,\n# >- $\\sigma_{i}(t, n)$ denotes the **moving average strandard deviation** with a lookback window of $n$ historical prices $p_i$, e.g. a 20, 50, or 100-days moving average lookback window.\n\n# Let's now determine the Z-Score at distinct time steps of the EUR vs. US-Dollar foreign exchange rate:\n\n#%%\n\nmre_mav_eurusd_data['Z_SCORE'] = (\n mre_mav_eurusd_data['Adj Close'] - mre_mav_eurusd_data['SMAV_20']) / mre_mav_eurusd_data['STD_20']\n\n\n# Inspect and validate the different rolling Z scores obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20', 'STD_20', 'Z_SCORE']].iloc[20:30]\n\n\n# Let's now derive a mean-reversion trading signal from the calculated rolling Z-Score of the EUR vs. US-Dollar foreign exchange rate. In order to derive such a signal we first specify an upper Z-Score threshold $\\alpha$ and a lower Z-Score threshold $\\beta$, where $\\alpha > \\beta$. 
Afterwards, we are able to derive a mean-reversion trading signal according to the following rules:\n#\n# >- **\"Long-signal\"** (+1.0) signal if $z_{i}(t) \\leq -\\; \\alpha \\cdot z_{i}(t)$;\n# >- **\"Close Long-signal\"** (0.0) signal if $z_{i}(t) \\leq -\\; \\beta \\cdot z_{i}(t)$;\n# >- **\"Short-signal\"** (+1.0) signal if $z_{i}(t) \\geq +\\; \\alpha \\cdot z_{i}(t)$; and,\n# >- **\"Close Short-signal\"** (0.0) signal if $z_{i}(t) \\geq +\\; \\beta \\cdot z_{i}(t)$.\n#\n# Let's now start to determine the mean-reversion trading signals by setting the Z-Score thresholds. Thereby, we will set both Z-Score thresholds $\\alpha = 1.0$ and $\\beta = 0.5$ respectively, as done in the following:\n\n#%%\n\nz_score_alpha_threshold = 1.0\nz_score_beta_threshold = 0.5\n\n\n# Subsequently we implement and derive the mean-reversion trading signals of the EUR vs. US-Dollar foreign exchange rate using both Z-Score thresholds as defined above:\n\n#%%\n\n# determine the distinct z-scores\nz_scores = mre_mav_eurusd_data['Z_SCORE']\n\n# init mean reversion signal\nmre_trade_signal = np.zeros(len(z_scores))\n\n# iterate over z-scores\nfor i in range(20, len(z_scores)):\n\n # determine actual z-score\n z_score = z_scores[i]\n\n # case: active trading signal\n if mre_trade_signal[i-1] == 0.0:\n\n # case: z-score exceeds positive threshold\n if z_score > z_score_alpha_threshold:\n\n # set 'short' signal\n mre_trade_signal[i] = -1.0\n\n # case: z-score exceeds negative threshold\n elif z_score < (z_score_alpha_threshold * -1.0):\n\n # set 'long' signal\n mre_trade_signal[i] = 1.0\n\n # case: z-score doesn't exceed thresholds\n else:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n # case: inactive trading signal\n elif mre_trade_signal[i-1] != 0.0:\n\n # z-score reverted back to moving average\n if abs(z_score) < z_score_beta_threshold:\n\n # set 'neutral' signal\n mre_trade_signal[i] = 0.0\n\n # z-score not yer reverted back to moving average\n elif abs(z_score) > z_score_beta_threshold:\n\n # keep prior signal\n mre_trade_signal[i] = mre_trade_signal[i-1]\n\n\n# Convert the obtained trading signals into a Pandas DataFrame and merge it with the market price data:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_data_signal = pd.DataFrame(\n mre_trade_signal, columns=['SIGNAL_20'], index=mre_mav_eurusd_data.index)\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_data['SIGNAL_20'] = mre_mav_eurusd_data_signal\n\n\n# Inspect and validate the different Z scores and **mean-reversion trading strategy signals** obtained, starting from the first obtained 20-day moving average price:\n\n#%%\n\nmre_mav_eurusd_data[['Adj Close', 'SMAV_20',\n 'STD_20', 'Z_SCORE', 'SIGNAL_20']].iloc[20:30]\n\n\n# In addition, let's also prepare a backtest of a **\"baseline\"** in terms of a simple **buy-and-hold** trading strategy for comparison purposes. 
Our buy-and-hold strategy sends a \"long\" (+1.0) signal for each time step:\n\n#%%\n\nmre_mav_eurusd_data['SIGNAL_BASE'] = 1.0\n\n\n# Prepare the trading signal data to be utilized in backtesting the mean-reversion trading strategy:\n\n#%%\n\n# convert signals to Pandas DataFrame\nmre_mav_eurusd_signal_data = pd.DataFrame(\n mre_mav_eurusd_data[['SIGNAL_20', 'SIGNAL_BASE']], columns=['SIGNAL_20', 'SIGNAL_BASE'])\n\n# convert pandas DataFrame index to datatype: datetime\nmre_mav_eurusd_signal_data = mre_mav_eurusd_signal_data.set_index(\n pd.to_datetime(mre_mav_eurusd_signal_data.index))\n\n\n# Inspect top 10 rows of the prepared trading signals:\n\n#%%\n\nmre_mav_eurusd_signal_data.head(10)\n\n\n# Inspect some of the exemplary signal deviations between the **mean-reversion** and our baseline **buy and hold** trading strategies:\n\n#%%\n\nmre_mav_eurusd_signal_data[mre_mav_eurusd_signal_data['SIGNAL_20']\n != mre_mav_eurusd_signal_data['SIGNAL_BASE']].head(10)\n\n\n# Visualize the prepared trading signals:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 5]\nfig, ax = plt.subplots(ncols=1, nrows=2)\n\nax[0].plot(mre_mav_eurusd_signal_data['SIGNAL_20'],\n lw=1.0, color='C2', label='SMAV 16 (red)')\nax[1].plot(mre_mav_eurusd_signal_data['SIGNAL_BASE'],\n lw=1.0, color='C3', label='BASE (purple)')\n\n# set axis labels\nplt.xlabel('[time]', fontsize=10)\nax[0].set_xlim([start_date, end_date])\nax[0].set_ylabel('[mre 100 signal]', fontsize=10)\nax[1].set_xlim([start_date, end_date])\nax[1].set_ylabel('[base signal]', fontsize=10)\n\n# rotate the x-axis labels\nfor tick in ax[0].get_xticklabels():\n tick.set_rotation(45)\n\nfor tick in ax[1].get_xticklabels():\n tick.set_rotation(45)\n\n# set plot title\nax[0].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Mean Reversion Trading Signals', fontsize=10)\nax[1].set_title(\n 'Euro vs. US-Dollar Exchange Rate - Baseline Buy and Hold Trading Signals', fontsize=10)\n\n# reset plot layout\nplt.tight_layout()\n\n\n# Let's determine the total number of **long-short signal changes** of the distinct trading strategies:\n\n#%%\n\n# signal changes of the mean-reversion trading strategy\nlen(list(it.groupby(mre_mav_eurusd_signal_data['SIGNAL_20'], lambda x: x > 0)))\n\n#%%\n\n# signal changes of the baseline buy and hold trading strategy\nlen(list(it.groupby(\n mre_mav_eurusd_signal_data['SIGNAL_BASE'], lambda x: x > 0)))\n\n\n# ### 5. 
Mean Reversion Signal Backtest\n\n# Prepare the market data to be utilized in backtesting the mean reversion trading strategy:\n\n#%%\n\n# extract the eurusd exchange rate closing prices\neurusd_market_data = pd.DataFrame(\n eurusd_data['Adj Close'], columns=['Adj Close'])\n\n# rename the 'close' column to 'eurusd' (since this is the column we want to allocate to in the backtest)\neurusd_market_data = eurusd_market_data.rename(columns={'Adj Close': 'EURUSD'})\n\n# convert pandas DataFrame index to datatype: datetime\neurusd_market_data = eurusd_market_data.set_index(\n pd.to_datetime(eurusd_data.index))\n\n\n# Inspect top 10 rows of the prepared market data:\n\n#%%\n\neurusd_market_data.head(10)\n\n\n# Implementing a simple Mean Reversion Trading Strategy by interfacing the Python `bt`'s Algo class:\n\n#%%\n\nclass MeanReversionStrategy(bt.Algo):\n\n # inits the strategy\n def __init__(self, signals):\n\n # set class signals\n self.signals = signals\n\n # calss the trading strategy\n def __call__(self, target):\n\n # case: current timestep in signals\n if target.now in self.signals.index[1:]:\n\n # get actual signal\n signal = self.signals[target.now]\n\n # set target weights according to signal\n target.temp['weights'] = dict(EURUSD=signal)\n\n # return 'True' since we want to move on to the next timestep\n return True\n\n\n# Define the moving average trading strategy backtest algorithm stack.\n#\n# **Note:** That in the Python `bt` library a trading strategy usually consists of a so-called **stack of algorithms**. For each timestep of our backtest timeframe, the `bt` library executes all algorithm of the stack in sequential order. Each moving average strategy we aim to design and backtest consists in total of three algorithms, briefly described in the following:\n#\n# > 1. `bt.algos.SelectAll()`: Selects all available stocks for trading except stock prices that correspond to NaN or 0.00.\n# > 2. `MovingAverageStrategy()`: Assigns the calculated signal in terms of a weight value to the EUR vs. USD exchange rate.\n# > 3. `bt.algos.Rebalance()`: Rebalances the available capital based on the weights assigned to each stock.\n\n# Define the mean-reversion and buy-and-hold trading strategy backtest algorithm stack:\n\n#%%\n\nmre_mav_eurusd_strategy_20 = bt.Strategy(name='mre_20', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_20']), bt.algos.Rebalance()])\nmre_mav_eurusd_strategy_base = bt.Strategy(name='base', algos=[bt.algos.SelectAll(\n), MeanReversionStrategy(mre_mav_eurusd_signal_data['SIGNAL_BASE']), bt.algos.Rebalance()])\n\n\n# Let's now define the trading ('fees') commissions used in each rebalancing time-step of a backtest. 
To achieve this, the `bt` library expects a callable function that expects the following two parameters as an input:\n#\n# > - the 'quantity', denoted by `q`, of rebalanced assets at a backtest time-step;\n# > - the 'price', denoted by `p`, of rebalanced assets at a backtest time-step.\n#\n# Let's implement such a callable function defining a trading fee of **1\\% (0.01)** per quantity of rebalanced asset (or a flat fee of **USD 5.00** per trade):\n\n#%%\n\n# init trading fees function\ndef trading_fees_function(q, p):\n\n # calcluate trading fees (rebalanced-quantity * trading-fee)\n fees = 5.00 # flat fee of USD 5.00 per trade\n\n # return the total trading fees\n return fees\n\n\n# Upon completion of defining the mean-reversion strategy let's now init the corresponding backtests using (1) both strategies as well as (2) the market data that we aim to evaluate during the backtest:\n\n#%%\n\neurusd_backtest_mre_mav_20 = bt.Backtest(strategy=mre_mav_eurusd_strategy_20, data=eurusd_market_data,\n name='eurusd_backtest_mre_20', commissions=trading_fees_function, progress_bar=True)\neurusd_backtest_mre_mav_base = bt.Backtest(strategy=mre_mav_eurusd_strategy_base, data=eurusd_market_data,\n name='eurusd_backtest_mre_base', commissions=trading_fees_function, progress_bar=True)\n\n\n# Now, let's run the backtest of the mean-reversion trading strategy configuration as well as the defined baseline:\n\n#%%\n\nbacktest_results_eurusd = bt.run(\n eurusd_backtest_mre_mav_20, eurusd_backtest_mre_mav_base)\n\n\n# Inspect the individual backtest results and performance measures:\n\n#%%\n\nbacktest_results_eurusd.display()\n\n\n# Collect detailed backtest performance per timestep of the **mean-reversion** strategy:\n\n#%%\n\nbacktest_mre_20_eurusd_details = eurusd_backtest_mre_mav_20.strategy.prices.to_frame(\n name='Rel. EQUITY')\n# equity per timestep\nbacktest_mre_20_eurusd_details['Abs. EQUITY'] = eurusd_backtest_mre_mav_20.strategy.values\n# cash per timestep\nbacktest_mre_20_eurusd_details['CASH'] = eurusd_backtest_mre_mav_20.strategy.cash\n# positions per timestep\nbacktest_mre_20_eurusd_details['POSITIONS'] = eurusd_backtest_mre_mav_20.strategy.positions\n# trading fees per timestep\nbacktest_mre_20_eurusd_details['FEES'] = eurusd_backtest_mre_mav_20.strategy.fees\n\n\n# Inspect detailed backtest results per timestep:\n\n#%%\n\nbacktest_mre_20_eurusd_details.head(10)\n\n\n# Visualize the monthly returns obtained by the **mean-reversion** trading strategy:\n\n#%%\n\nplt.rcParams['figure.figsize'] = [15, 10]\nfig = plt.figure()\nax = fig.add_subplot(111)\n\n# plot heatmap of monthly returns generated by the strategy\nax = sns.heatmap(eurusd_backtest_mre_mav_20.stats.return_table,\n annot=True, cbar=True, vmin=-0.5, vmax=0.5)", "original_comment": "# set axis labels\n", "target_code": "ax.set_xlabel('[month]', fontsize=10)\nax.set_ylabel('[year]', fontsize=10)\n", "project_metadata": {"full_name": "financial-data-science/CFDS-Notebooks", "description": "A series of interactive labs we prepared for the Chartered Financial Data Scientist Certification. 
The content of the series is based on Python, IPython Notebook, and PyTorch.", "topics": ["financial-data-analysis", "financial-data-science", "financial-machine-learning"], "git_url": "git://github.com/financial-data-science/CFDS-Notebooks.git", "stars": 4, "watchers": 4, "forks": 1, "created": "2020-10-20T19:38:53Z", "size": 35533, "license": "bsd-3-clause", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 1327604}, "last_updated": "2020-12-16T11:38:43Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "plt.gca().set_axis_bgcolor('white')\nbacktest_results_eurusd.plot.bar(\n subplots=True, yticks=range(backtest_results_eurusd.shape[0]))\n", "model": "docstring", "intent": "# set axis labels"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 
11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the previously trained model and continue running some more training to see how well it can do.\n\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport warnings\nimport util\nfrom pathlib import Path\nfrom fastai.vision import *\nfrom fastai import *\nget_ipython().run_line_magic('reload_ext', 'autoreload')\nget_ipython().run_line_magic('autoreload', '2')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n#%%\n\nwarnings.filterwarnings(action='once')\n\n\n# ## User Params\n\n#%%\n\nbasepath = Path('/home/jupyter/data')\ndataset_name = 'uo_dress'\nn_epoch = 100\n\n\n# ## Derived Params and Helpers\n\n#%%\n\npath_img = basepath/'imagenet_style'/dataset_name\nlabels_file = basepath/dataset_name/'labels.csv'\n\n#%%\n\ndef create_learner(data, savename='dummy', patience=8, modeltype=models.resnet34):\n learn = create_cnn(data,\n modeltype,\n metrics=error_rate,\n callback_fns=[partial(SaveModelCallback, name=savename),\n partial(EarlyStoppingCallback,\n patience=patience),\n ShowGraph])\n return learn\n\n\n# ## Create dataset\n\n#%%\n\ndata = ImageDataBunch.from_folder(path_img,\n train='train',\n valid='valid',\n test='test',\n ds_tfms=get_transforms(),\n size=224)\ndata.normalize(imagenet_stats)\n\n#%%\n\nprint(data.classes)\n\n\n# ## Display dataset\n\n#%%\n\ndata.show_batch(rows=6)\n\n\n# ## Train a standard transfer learning model (stage 1)\n\n#%%\n\nlearn = create_learner(data, 'stage1-bestmodel')\nlearn.fit_one_cycle(n_epoch)\n\n\n# ### Show results from generic evaluation tool\n\n#%%\n\nresults_val = util.eval_rollup(labels_file, learn, evalset='val')\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ### Show results using FastAI to confirm matches\n\n#%%\n\n# Confirm fastai reporting matching the generic one used for other platforms\n# Some formatting work needed below to run on test and not validation dataset\n# Convert strings to numbers (tensor)\ny_true_num = tensor([data.classes.index(z) for z in y_true])\npreds, y, losses = learn.get_preds(with_loss=True, is_test=True)\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n\n\n# Fastai confusion matrix matches the generic one for the test dataset.\n\n# ## Unfreezing, fine-tuning, and learning rates\n\n# First let's try just unfreezing the 
previously trained model and continue running some more training to see how well it can do.\n\n#%%\n\nlearn = create_learner(data, 'unfreeze-bestmodel')\nlearn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\n\n#%%\n\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\n\n\n# ## Unfreeze but use identified learning rates\n\n# Now let's go back to the original model that only trained the head again.\n# But this time, we'll find the best learning rate to use for this, and then use those learning rates with the unfrozen model.\n\n# First, find and plot the learning rates using lr_find.\n\n#%%\n\nlearn = create_learner(data)\nlearn.load('stage1-bestmodel')\n# Need to flush callbacks or else lr_find crashes\nlearn.callback_fns = [Recorder]\nlearn.lr_find()\nlearn.recorder.plot()\n\n\n# Now that we have the learning rate range, let's feed that into the model and rerun\n\n#%%", "original_comment": "# Reload with new savename\n", "target_code": "learn = create_learner(data, 'stage2-bestmodel')\nlearn.load('stage1-bestmodel')\n", "project_metadata": {"full_name": "URBNOpenSource/custom-vision-study", "description": null, "topics": [], "git_url": "git://github.com/URBNOpenSource/custom-vision-study.git", "stars": 5, "watchers": 5, "forks": 4, "created": "2019-03-12T20:31:02Z", "size": 19785, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 5009642, "Python": 5509, "Shell": 928}, "last_updated": "2019-10-24T13:27:26Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Disagree", "usefulness-score": 1, "compatibility": "Agree", "compatibility-score": 2, "precision": "Agree", "precision-score": 2, "coverage": "Disagree", "coverage-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "learn.load('stage1-bestmodel')\nlearn.unfreeze()\nlearn.fit_one_cycle(n_epoch)\nresults_test, y_true, y_pred, scores, class_labels = util.eval_rollup(\n labels_file, learn, evalset='test')\ninterp = ClassificationInterpretation(data, preds, tensor(y_true_num), losses)\ninterp.plot_top_losses(15, figsize=(15, 11))\ninterp.plot_confusion_matrix(figsize=(6, 6), dpi=60)\n", "model": "docstring", "intent": "# Reload with new savename"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n\ndf = df_orig.copy()\n\n\ndf.info()\n\n\ndf.head(5)\n\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n\n# Use function to add in indicators for presence of null values\n\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood 
is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n\nfloat(9.00000).is_integer()\n\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n\n\n\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n\n# ?np.where\n\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n\ndf.head(5)\n\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n\ndf['MasVnrType'].value_counts()\n\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n\ndf[(df.BsmtQual_missing == True)]\n\n\ndf.BsmtCond.value_counts()\n\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from dataset\n\n\ndf.Electrical.value_counts()\n\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces 
in places with fireplaces missing\n\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n 
validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n\n# ## 2. Create additional bespoke data features\n\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n\ndf['Functional'].value_counts()\n\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n\ndf['PoolQC'].value_counts()\n\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n\ndf['Condition1'].value_counts()\n\n\ndf['Condition2'].value_counts()\n\n\ndf['Condition1']\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, inplace=True)\ndf.drop('Condition2', axis=1, 
inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n\ndf.head(10)\n\n\n# ***\n# ## 4. Set up target encoding parameters\n\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n\ndf_oe.head(5)\n\n\n# ***\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Data Preperation\n\n# * This notebook contains the detailed working and testing for data preparation.\n# * All the contents of the summarised steps are included in the modelling workbook\n# * Further data features may have been added in the modelling phase. This was just the cleaning and set up I thought was necessary as a starting point prior to modelling.\n#


\n# Overall steps for data preparation will be:\n#\n# 0. Import modules and initialise data frame\n# 1. Deal with any null values\n# 2. Create additional bespoke data features\n# 3. Create manual OneHotEncoding\n# 4. Design code for target_encoded columns\n# 5. Design code for ordinal_encoded columns\n# 6. Design code for onehot encoded columns\n# 7. Run individual code sets and expected modelling data set (noting params in pipeline that may change)\n#

\n#\n# Originally had a step:\n# *Extract file for use in model pipeline (enables target encoding parameters to be manipulated)*\n#\n# Decided to remove this step since I thought it would just complicate adding further features once I was in the modelling phase.\n\n# ## 0. Import modules and data set, adjust pandas settings\n#\n\n#%%\n\nimport numpy as np\nimport pandas as pd\nimport category_encoders as ce\nimport sklearn.pipeline as pipeline\n\n#%%\n\npd.set_option('display.max_rows', None)\npd.set_option('display.max_columns', None)\n\n#%%\n\ndf_orig = pd.read_csv(\n r\"C:\\Users\\Jonat\\ga\\Material\\Unit 3\\homework\\data\\iowa_full.csv\")\n\n#%%\n\ndf = df_orig.copy()\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head(5)\n\n#%%\n\n# ***\n\n# ## 1. Deal with any null values\n#\n# Below section steps through logic and checks. See summary at the end for all adjustments in a single point.\n#\n\n#%%\n\n# Use function to add in indicators for presence of null values\n\n#%%\n\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n#%%\n\ndf = denote_null_values(df)\n\n#%%\n\ndf.info()\n# This shwos an additional 19 \"_missing\" columns so the function work properly.\n\n\n# ***\n\n#%%\n\n# LotFrontage - replace nulls using average for the neighbourhood.\n# get a DF to join to the data set as a new column\nlotfrontage_neighborhood_mean = df.groupby(by=['Neighborhood'])[\n ['LotFrontage']].mean().reset_index()\nlotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\nlotfrontage_neighborhood_mean\n\n#%%\n\ndf = df.merge(lotfrontage_neighborhood_mean, how='left',\n left_on='Neighborhood', right_on='Neighborhood')\n\n#%%\n\ndf['LotFrontage'] = df['LotFrontage'].fillna(df.LotFrontage_Neighborhood_Mean)\n\n#%%\n\ndf.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n#%%\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', 
axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n#%%\n\n# Test the functions above\ntrain = df.sample(frac=0.3, random_state=743)\ntest = df.drop(train.index)\ntrain, val = train.iloc[:-100], train.iloc[-100:]\n\n#%%\n\nprint(\n f\"train size {train.shape[0]} and test size {test.shape[0]} and val size {val.shape[0]}\")\nprint(\n f\"total size {df.shape[0]} and check size {train.shape[0] + test.shape[0] + val.shape[0]}\")\n\n#%%\n\ntrain, test, val = LotFrontage_na_apply(train, test, val)\n\n#%%\n\nfloat(9.00000).is_integer()\n\n#%%\n\n# Exclude the numbers that end evenly (i.e. original data), and look at results\n# Then compare with same code for the test set\n# Realised after could have just used LotFrontage_missing!; Probably simpler and clearer\n# train[~(train['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntrain[(train.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\ntrain[(train['Neighborhood'] == 'BrkSide')]['LotFrontage'].mean()\n\n#%%\n\n#test[~(test['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\ntest[(test.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n#val[~(val['LotFrontage'].apply(lambda x: x.is_integer()))].groupby(by='Neighborhood')['LotFrontage'].value_counts()\nval[(val.LotFrontage_missing == True)].groupby(\n by='Neighborhood')['LotFrontage'].value_counts()\n\n#%%\n\n\n\n#%%\n\ntrain[['LotFrontage', 'LotFrontage_missing']]\n\n#%%\n\ntrain['LotFrontage'] = np.where(\n train['LotFrontage_missing'] == True, np.nan, train['LotFrontage'])\n\n\n# ***\n\n#%%\n\n# Create AlleyAccess_Flag\ndf['Alley'].value_counts()\n\n#%%\n\n# ?np.where\n\n#%%\n\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\n\n#%%\n\ndf.head(5)\n\n#%%\n\ndf[(df['AlleyAccess_Flag'] == 1)].head(5)\n\n#%%\n\ndf['Alley'] = df['Alley'].fillna('no_access')\n\n#%%\n\ndf['MasVnrType'].value_counts()\n\n#%%\n\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\n\n#%%\n\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\n\n\n# ***\n\n#%%\n\ndf[(df.BsmtQual_missing == True)]\n\n#%%\n\ndf.BsmtCond.value_counts()\n\n#%%\n\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.Electrical_missing == True)]['Utilities']\n# Given the record shows electricity is present, replace with typical electrical system from 
dataset\n\n#%%\n\ndf.Electrical.value_counts()\n\n#%%\n\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\n\n\n# ***\n\n#%%\n\ndf[(df.FireplaceQu_missing == True)]['Fireplaces'].sum()\n# Doesn't look there are any fireplaces in places with fireplaces missing\n\n#%%\n\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf[(df.GarageType_missing == True)][['GarageType', 'GarageYrBlt',\n 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']]\n# Doesn't look like there are any cases where there is garage relevant data\n\n#%%\n\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\n\n\n# ****\n\n#%%\n\ndf[df.PoolQC_missing == True]['PoolArea'].sum()\n# Check if any areas without pool data recorded have a pool in the mix\n\n#%%\n\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\n\n\n# ***\n\n#%%\n\ndf['Fence'] = df['Fence'].fillna('NA')\n\n#%%\n\ndf['MiscFeature'] = df['MiscFeature'].fillna('NO_MISC_FEATURE_RECORDED')\n\n\n# ### 1 Summary: Capture all adjustments in a single step\n\n#%%\n\n# Capture all adjustments to deal with NaN values.\ndef denote_null_values(df):\n \"\"\"Denotes whether or not there are null values or not\"\"\"\n empty_cols_query = df.isnull().sum() > 0\n empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()\n for col in empty_df_cols:\n col_name = f\"{col}_missing\"\n df[col_name] = pd.isnull(df[col])\n return df\n\n\ndf = denote_null_values(df)\n\n# LotFrontage Functions to populate training, test and validation\n\n\ndef LotFrontage_na_calc(training_df):\n lotfrontage_neighborhood_mean = training_df.groupby(\n by=['Neighborhood'])[['LotFrontage']].mean().reset_index()\n lotfrontage_neighborhood_mean.columns = [\n 'Neighborhood', 'LotFrontage_Neighborhood_Mean']\n return lotfrontage_neighborhood_mean\n\n\ndef LotFrontage_na_apply(training_df, testing_df, validation_df=None):\n # Calc mean based on training data\n lnm = LotFrontage_na_calc(training_df)\n\n # Apply mean to training data - for neighbourhood\n # Reset LotFrontage NaN in case they have been filled in a prior run\n training_df['LotFrontage'] = np.where(\n training_df['LotFrontage_missing'] == True, np.nan, training_df['LotFrontage'])\n training_df = training_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n training_df['LotFrontage'] = training_df['LotFrontage'].fillna(\n training_df.LotFrontage_Neighborhood_Mean)\n training_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n\n # Apply mean to testing data\n # Reset LotFrontage NaN in case they have been filled in a prior run\n testing_df['LotFrontage'] = np.where(\n testing_df['LotFrontage_missing'] == True, np.nan, testing_df['LotFrontage'])\n testing_df = testing_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n testing_df.LotFrontage_Neighborhood_Mean)\n testing_df.drop('LotFrontage_Neighborhood_Mean', axis=1, inplace=True)\n # Fill the training sample mean if a specific neighborhood is missing from the training sample\n testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n\n if validation_df is None:\n return training_df, testing_df\n else:\n # Apply mean to validation data set\n validation_df['LotFrontage'] = np.where(\n 
validation_df['LotFrontage_missing'] == True, np.nan, validation_df['LotFrontage'])\n validation_df = validation_df.merge(\n lnm, how='left', left_on='Neighborhood', right_on='Neighborhood')\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n validation_df.LotFrontage_Neighborhood_Mean)\n validation_df.drop('LotFrontage_Neighborhood_Mean',\n axis=1, inplace=True)\n validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(\n training_df['LotFrontage'].mean())\n return training_df, testing_df, validation_df\n\n\n# Other fills don't rely on knowledge of full sample to update\ndf['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(), 0, 1)\ndf['MasVnrType'] = df['MasVnrType'].fillna('None')\ndf['MasVnrArea'] = df['MasVnrArea'].fillna(0)\ndf['BsmtQual'] = df['BsmtQual'].fillna('NA')\ndf['BsmtCond'] = df['BsmtCond'].fillna('NA')\ndf['BsmtExposure'] = df['BsmtExposure'].fillna('NA')\ndf['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')\ndf['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')\ndf['Electrical'] = df['Electrical'].fillna('SBrkr')\ndf['FireplaceQu'] = df['FireplaceQu'].fillna('NA')\ndf['GarageType'] = df['GarageType'].fillna('NA')\ndf['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)\ndf['GarageFinish'] = df['GarageFinish'].fillna('NA')\ndf['GarageQual'] = df['GarageQual'].fillna('NA')\ndf['GarageCond'] = df['GarageCond'].fillna('NA')\ndf['PoolQC'] = df['PoolQC'].fillna('NA')\ndf['Fence'] = df['Fence'].fillna('NA')\ndf['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')\n\n#%%\n\n# ## 2. Create additional bespoke data features\n\n#%%\n\n# Created df['AlleyAccess_Flag'] above\n\n\n# ***\n\n#%%\n\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\n\n#%%\n\ndf['BsmtFinSF_Total'].isnull().sum()\n\n\n# ***\n\n#%%\n\ndf['Functional'].value_counts()\n\n#%%\n\nnp.where(df['Functional'] == 'Typ', 1, 0).sum()\n\n#%%\n\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf.head(5)\n\n\n# ***\n\n#%%\n\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf.head(5)\n\n#%%\n\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf.head(5)\n\n#%%\n\ndf[(df['HasPorch_flag'] == 0)].head(5)\n\n\n# ***\n\n#%%\n\ndf['PoolQC'].value_counts()\n\n#%%\n\nnp.where(df['PoolQC'] != 'NA', 1, 0).sum()\n\n#%%\n\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n#%%\n\ndf[(df['HasPool_flag'] == 1)].head(10)\n\n\n# ***\n\n# ## Section 2 summary - all code in one step\n\n#%%\n\n# Additional data features to tidy things up; potentially drop some others\ndf['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2']\ndf['Functional_Typical_flag'] = np.where(df['Functional'] == 'Typ', 1, 0)\ndf['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF'] +\n df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])\ndf['HasPorch_flag'] = np.where(df['PorchSF_Total'] > 0, 1, 0)\ndf['HasPool_flag'] = np.where(df['PoolQC'] != 'NA', 1, 0)\n\n\n# # 3. 
Create manual OneHotEncoding\n#\n# This is required for 6 columns in the data, each of which contain multiple pieces of information\n# * Condition1 & Condition2\n# * Exterior1st & Exterior2nd\n# * BsmtFinType1 & BsmtFinType2\n#\n# This will be set up as 3 functions that put in place the coding for a data frame.\n#\n# All info will be combined in a summary in a final cell.\n\n#%%\n\ndf['Condition1'].value_counts()\n\n#%%\n\ndf['Condition2'].value_counts()\n\n#%%\n\ndf['Condition1']\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n#%%\n\ndf_test = df_orig.copy()\ndf_test1 = ManualOneHotEncoding(\n df_test, ['Condition1', 'Condition2'], 'Condition')\ndf_test1.head(10)\n\n#%%\n\ndf_test1[(df_test1['Condition_PosA'] == 1)][['Condition1',\n 'Condition2', 'Condition_Artery', 'Condition_PosA']]\n\n\n# **Up to figuring out how to implement column checks for manual onehot encoding**\n#\n\n#%%\n\ncolumn_list = ['Condition1', 'Condition2'] # ['a','b','c','d']\n\nfor i, col in enumerate(column_list):\n print(f\"{i}: {col}\")\n\n#%%\n\ndf[((df['Condition1'] == 'Artery') | (df['Condition2'] == 'Artery'))].head(10)\n\n#%%\n\ncol1 = 'Condition1'\ncol2 = 'Condition2'\nonehot_target = 'Artery'\n\ncol_cond = (df[col1] == onehot_target)\ncol_cond_a = col_cond\ncol_cond = col_cond | (df[col2] == onehot_target)\ncol_cond_b = col_cond\nnp.where(cond, 1, 0)\n\n\n# ***\n# ### Section 3 Summary - All Code in one step\n\n#%%\n\ndef ManualOneHotEncoding(df, column_list, ohc_prefix):\n # Identify values for new one hot encoded columns\n\n unique_col_vals = []\n\n for i, col in enumerate(column_list):\n if i == 0:\n unique_col_vals = df[col].unique().tolist()\n else:\n [unique_col_vals.append(j) for j in df[col].unique().tolist()]\n\n # Limit to unique values to generate columns\n unique_col_vals_set = set(unique_col_vals)\n new_cols = sorted(list(unique_col_vals_set))\n\n # Create and populate columns for data set\n for col in new_cols:\n new_col = ohc_prefix + '_' + col\n df[new_col] = 0 # Create new columns and set to 0\n onehot_target = col\n for i, target_cols in enumerate(column_list):\n if i == 0:\n where_conditions = (df[target_cols] == onehot_target)\n else:\n where_conditions = where_conditions | (\n df[target_cols] == onehot_target)\n # Populate with 0s & 1s\n df[new_col] = np.where(where_conditions, 1, 0)\n\n return df\n\n\n# Populate OneHotEncoded Columns\ndf = ManualOneHotEncoding(df, ['Condition1', 'Condition2'], 'Conditions')\ndf = ManualOneHotEncoding(df, ['Exterior1st', 'Exterior2nd'], 'Exterior')\ndf = ManualOneHotEncoding(df, ['BsmtFinType1', 'BsmtFinType2'], 'BsmtFinType')\n\n# Drop OneHotEncoded Columns\ndf.drop('Condition1', axis=1, 
inplace=True)\ndf.drop('Condition2', axis=1, inplace=True)\ndf.drop('Exterior1st', axis=1, inplace=True)\ndf.drop('Exterior2nd', axis=1, inplace=True)\ndf.drop('BsmtFinType1', axis=1, inplace=True)\ndf.drop('BsmtFinType2', axis=1, inplace=True)\n\n#%%\n\ndf.head(10)\n\n#%%\n\n# ***\n# ## 4. Set up target encoding parameters\n\n#%%\n\ntarg_enc_cols = [\n 'MSSubClass',\n 'MSZoning',\n 'LandContour',\n 'Neighborhood',\n 'BldgType',\n 'HouseStyle',\n 'RoofStyle',\n 'RoofMatl',\n 'MasVnrType',\n 'Foundation',\n 'Heating',\n 'Electrical',\n 'Functional',\n 'GarageType',\n 'Fence',\n 'SaleType',\n 'SaleCondition',\n]\ntarget_enc = ce.TargetEncoder(\n verbose=1, cols=targ_enc_cols, min_samples_leaf=5, smoothing=0.1)\ntarget_enc.get_params()\n\n# Keep min_samples_leaf / smoothing in order to enable these variables to be adjusted as test different model pipelines\n\n#%%\n\ndf_te = target_enc.fit_transform(df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_te.head(5)\n\n\n# ***\n# ## 4. Set up Ordinal encoding parameters\n\n#%%\n\nordenc_cols = [\n 'LotShape',\n 'Utilities',\n 'LotConfig',\n 'LandSlope',\n 'ExterQual',\n 'ExterCond',\n 'BsmtQual',\n 'BsmtCond',\n 'BsmtExposure',\n 'HeatingQC',\n 'KitchenQual',\n 'FireplaceQu',\n 'GarageFinish',\n 'GarageQual',\n 'GarageCond',\n 'PavedDrive',\n 'PoolQC',\n]\n\nordenc_maps = [\n {'col': 'LotShape', 'mapping': {\"Reg\": 0, \"IR1\": 1, \"IR2\": 2, \"IR3\": 3}},\n {'col': 'Utilities', 'mapping': {\"AllPub\": 0, \"NoSwer\": 1, \"NoSeWa\": 2, \"ELO\": 3}},\n {'col': 'LotConfig', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'LandSlope', 'mapping': {'Gtl': 1, 'Mod': 2, 'Sev': 3, }},\n {'col': 'ExterQual', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'ExterCond', 'mapping': {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtQual', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtCond', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'BsmtExposure', 'mapping': {\n 'Gd': 1, 'Av': 2, 'Mn': 3, 'No': 4, 'NA': 5, }},\n {'col': 'HeatingQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'KitchenQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'FireplaceQu', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageFinish', 'mapping': {'Fin': 1, 'RFn': 2, 'Unf': 3, 'NA': 4, }},\n {'col': 'GarageQual', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'GarageCond', 'mapping': {\n 'NA': 0, 'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n {'col': 'PavedDrive', 'mapping': {'Y': 1, 'P': 2, 'N': 3}},\n {'col': 'PoolQC', 'mapping': {'NA': 0, 'Ex': 1,\n 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, }},\n]\n\nordinal_enc = ce.OrdinalEncoder(\n cols=ordenc_cols, mapping=ordenc_maps, verbose=1)\nordinal_enc.get_params()\n\n#%%\n\ndf_oe = ordinal_enc.fit_transform(\n df.drop('SalePrice', axis=1), df['SalePrice'])\n\n#%%\n\ndf_oe.head(5)\n\n\n# ***", "original_comment": "# ## 5. 
Set up OneHot encoding parameters\n", "target_code": "onehot_enc = ce.OneHotEncoder(verbose=1, cols=[\n 'Street', 'Alley', 'CentralAir', 'MiscFeature'], use_cat_names=True)\nonehot_enc.get_params()\n", "project_metadata": {"full_name": "JonathanBechtel/DAT-10-19", "description": "GitHub Repo For DAT 10-19", "topics": [], "git_url": "git://github.com/JonathanBechtel/DAT-10-19.git", "stars": 2, "watchers": 2, "forks": 11, "created": "2020-10-19T14:53:15Z", "size": 108252, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 72671490, "HTML": 915086, "Python": 92446, "Shell": 222}, "last_updated": "2021-01-06T23:37:08Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0, "precision": "Agree", "precision-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "df_ohe = pd.get_dummies(df, columns=ordinal_cols)\ndf_ohe.head(5)\n", "model": "natural", "intent": "# 5. Set up OneHot encoding parameters"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. 
Use the ImageDataGenerator function.\n\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n\ninput_shape\n\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. Use 50 epochs, 25 training steps and 15 validation steps\n\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. 
How did the results improve?\n\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Convolutional Neural Networks\n#\n# In this assignment, we will learn about convolutional neural networks. We will create a CNN and learn to classify image data.\n\n# In this lecture, we will use the image data generator to classify our data. The data is loaded below:\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom tensorflow.keras.preprocessing.image import ImageDataGenerator\nfrom tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Conv2D, MaxPooling2D\nfrom tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model\n\n#%%\n\nPATH = '/content/drive/MyDrive/content'\ntrain_data_dir = PATH + '/dogs-vs-cats/train/'\ntest_data_dir = PATH + '/dogs-vs-cats/test/'\n\nimg_width, img_height = 150, 150\nbatch_size = 80\n\n#%%\n\n# This block of code is used to ensure the input shape is correct\n\nif K.image_data_format() == 'channels_first':\n input_shape = (3, img_width, img_height)\nelse:\n input_shape = (img_width, img_height, 3)\n\n\n# Define a train data generator with shear range of 0.3, zoom range of 0.1 and rescale to 1./255 (note that we must make 1 a float to produce a correct fraction). Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n\n# parameters\nclass_mode = 'binary'\n\n# create generator\ndatagen = ImageDataGenerator(rescale=1./255., validation_split=0.25,\n shear_range=0.3,\n zoom_range=0.1)\n\n\n# Define a test data generator that only rescales to 1./255. 
Use the ImageDataGenerator function.\n\n#%%\n\n# Answer below:\n# create generator\ntestgen = ImageDataGenerator(rescale=1./255., shear_range=0.3,\n zoom_range=0.1,)\n\n\n# The train generator and the test generator are defined below:\n\n#%%\n\n# prepare an iterators for each dataset\ntrain = datagen.flow_from_directory(train_data_dir,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n shuffle=True,\n batch_size=32,\n subset=\"training\")\n\nvalid = datagen.flow_from_directory(train_data_dir,\n shuffle=True,\n class_mode=class_mode,\n target_size=(img_width, img_height),\n batch_size=32,\n subset=\"validation\")\n\n\n# Shuffle off for test data so that I can run the classification report against prediction made on this data.\ntest = testgen.flow_from_directory(test_data_dir,\n shuffle=False,\n class_mode=class_mode,\n batch_size=10,\n target_size=(img_width, img_height))\n# confirm the iterator works\nbatchX, batchy = train.next()\nprint('Batch shape=%s, min=%.3f, max=%.3f' %\n (batchX.shape, batchX.min(), batchX.max()))\n\n\n# We'll start with a simple model. In CNNs, we first convolve the to extract features and then we add the dense layers.\n#\n# Create a model with one layer of convolution of size 64, one layer of activation, one layer of max pooling with pool size (2,2) and then one flattening layer, one dense layer of unit size 64 with a ReLU activation and one dense output layer. The output layer should have a sigmoid activation.\n\n#%%\n\ninput_shape\n\n#%%\n\n# Answer below:\n\nCNN_model = Sequential()\n\n# Input Layer\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Output Layer\nCNN_model.add(Flatten())\nCNN_model.add(Dense(64, activation='relu'))\nCNN_model.add(Dense(1, activation='sigmoid'))\n\n#%%\n\nCNN_model.summary()\n\n\n# Compile the model using RMSprop.\n\n#%%\n\n# Answer below:\nCNN_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n\n# Fit the model using a fit generator. Use 50 epochs, 25 training steps and 15 validation steps\n\n#%%\n\nEPOCHS = 50\nSTEP_SIZE_TRAIN = 25\nSTEP_SIZE_VALID = 15\n\n# Answer below:\nCNN_history = CNN_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\nhistory = pd.DataFrame(CNN_history.history)\nhistory['model'] = \"One\"\n\n\n# Create a new model by adding an additional group of convolution, activation and max pooling layers before the flatten layer. Make the convolution layer of unit size 32. Keep everything else the same.\n\n#%%\n\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n# Second Convolutional layer.\nnew_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n\n\n# Output Layer\nnew_model.add(Flatten())\nnew_model.add(Dense(64, activation='relu'))\nnew_model.add(Dense(1, activation='sigmoid'))\n\n\n# Fit and compile the model in the same way you did with the previous model. 
How did the results improve?\n\n#%%\n\n# Answer below:\n# Answer below:\nnew_model.compile(optimizer='rmsprop',\n loss=\"binary_crossentropy\", metrics=[\"MSE\", \"accuracy\"])\n\n# Answer below:\nnew_history = new_model.fit(train,\n steps_per_epoch=STEP_SIZE_TRAIN,\n validation_data=valid,\n validation_steps=STEP_SIZE_VALID,\n epochs=EPOCHS)\n\nnew_history = pd.DataFrame(new_history.history)\nnew_history['model'] = 'Two'\nhistory = pd.concat([history, new_history])\n\n\n# It looks like there isn't an improvement.\n\n# Create a new model based on the model above. Add an additional dense layer of size 64 with a ReLU activation after the flatten layer.\n\n#%%\n\n# Answer below:\n# Answer below:\n# Answer below:\n\nnew_model = Sequential()\n\n# Input Layer\nnew_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))", "original_comment": "# Second Convolutional layer.\n", "target_code": "new_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nnew_model.add(Activation('relu'))\nnew_model.add(MaxPooling2D(pool_size=(2, 2)))\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Disagree", "precision-score": 1, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "CNN_model = Sequential()\nCNN_model.add(Conv2D(64, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\nCNN_model.add(Conv2D(32, (3, 3), padding='same',\n input_shape=input_shape))\nCNN_model.add(Activation('relu'))\nCNN_model.add(MaxPooling2D(pool_size=(2, 2)))\nCNN_model.\n", "model": "no-comments", "intent": "# Second Convolutional layer."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. 
Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n\ndf_cinemas.head()\n\n\n# #### 2. Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. 
Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n\n', '.join([cat for cat in fs_categories])\n\n\ncinema = df_cinemas.loc[0]\n\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues 
in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Capstone Project - The Battle of Neighborhoods\n\n# ## Introduction\n\n# Introduction where you discuss the business problem and who would be interested in this project.\n\n# #### \"Would you recommend a location in Hong Kong to open a new cinema?\"\n# My boss, the stakeholder wants to **open a new cinema as company's new business**.\n#\n# He explains that watching movie is a part of whole afternoon or night activities. Cinema should has **many restaurants and shopping places nearby**. Transportation is also an important factor. Customer can walk to cinema within **5 minutes** from **public transport facilities** is perfect.\n#\n# He wants me concentrated on selection of cinema location according to its nearby environment. Cinema facility and rental price is not my concern. He lists out his **top 10 favorite cinemas** in Hong Kong with rating.\n#\n# I work with my teammates and select **5 possible locations** to build the cinema. Which location should be suggested to the stakeholder?\n\n# ## Data\n\n# Data where you describe the data that will be used to solve the problem and the source of the data.\n\n# According to the question, following data are required.\n\n# #### 1. Geographic coordinate of Hong Kong cinemas\n#\n# I need to **compare 5 possible locations with current cinemas** in Hong Kong. Therefore, I need to find a list of Hong Kong cinema and cinemas' geographic coordinates. Luckily, I can find the list and coordinates from the website https://hkmovie6.com/cinema .\n\n#%%\n\n# Import necessary library\nfrom sklearn.preprocessing import MinMaxScaler\nimport folium\nfrom scipy import stats\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport matplotlib\nfrom pathlib import Path\n# tranform JSON file into a pandas dataframe\nfrom pandas.io.json import json_normalize\nimport foursquare\nimport googlemaps\nimport json\nimport pandas as pd\n\n#%%\n\n# Download the cinema list\nget_ipython().system('wget -O hk_cinema_list.json https://hkmovie6.com/api/cinemas/lists')\n\n#%%\n\n# Convert the JSON data into DataFrmae\ncinemas_json = None\nwith open('hk_cinema_list.json', 'r', encoding='utf-8') as f:\n cinemas_json = json.load(f)\n\ncinemas = []\nfor data in cinemas_json['data']:\n cinemas.append({\n 'Name': data['name'],\n 'ChiName': data['chiName'],\n 'Address': data['address'],\n 'Latitude': data['lat'],\n 'Longitude': data['lon']\n })\ndf_cinemas = pd.DataFrame(\n cinemas, columns=['Name', 'ChiName', 'Address', 'Latitude', 'Longitude'])\n\n#%%\n\nprint('There are {} cinemas in Hong Kong'.format(len(df_cinemas)))\n\n\n# First five records of Hong Kong cinemas\n\n#%%\n\ndf_cinemas.head()\n\n\n# #### 2. 
Geographic coordinates of 5 possible cinema addresses\n# Geographic coordinates of 5 possible cinemas are required and I can use Google Map API to find this information\n\n#%%\n\npossible_locations = [\n {'Location': 'L1', 'Address': 'Sau Mau Ping Shopping Centre, Sau Mau Ping'},\n {'Location': 'L2', 'Address': 'Tuen Mun Ferry, Tuen Mun'},\n {'Location': 'L3', 'Address': 'Un Chau Shopping Centre, Cheung Sha Wan'},\n {'Location': 'L4', 'Address': 'Prosperity Millennia Plaza, North Point'},\n {'Location': 'L5', 'Address': 'Tsuen Fung Centre Shopping Arcade, Tsuen Wan'},\n]\n\n#%%\n\n# install the google map api client library\nget_ipython().system('pip install -U googlemaps')\n\n#%%\n\ngoogle_act = None\nwith open('google_map_act.json', 'r') as f:\n google_act = json.load(f)\n\nGOOGLE_MAP_API_KEY = google_act['api_key']\n\ngmaps = googlemaps.Client(key=GOOGLE_MAP_API_KEY)\n\n#%%\n\n# Retrieve geolocation and create the dataframe of pending cinema addresses\ndef getLatLng(address):\n latlnt = gmaps.geocode('{}, Hong Kong'.format(address))\n return (latlnt[0]['geometry']['location']['lat'], latlnt[0]['geometry']['location']['lng'])\n\n\n# Dataframe of 5 target locations with geographic coordinates information\n\n#%%\n\nfor loc in possible_locations:\n (lat, lng) = getLatLng(loc['Address'])\n loc['Latitude'] = lat\n loc['Longitude'] = lng\n\ndf_possible_locations = pd.DataFrame(possible_locations, columns=[\n 'Location', 'Address', 'Latitude', 'Longitude'])\ndf_possible_locations\n\n\n# #### 3. Favorite cinema list of stakeholder\n\n# The favorite cinema list of stakeholder is an important information that I can **use it as profile to select the best location**.\n\n#%%\n\nboss_favorite = [\n {'Name': 'Broadway Circuit - MONGKOK', 'Rating': 4.5},\n {'Name': 'Broadway Circuit - The ONE', 'Rating': 4.5},\n {'Name': 'Grand Ocean', 'Rating': 4.3},\n {'Name': 'The Grand Cinema', 'Rating': 3.4},\n {'Name': 'AMC Pacific Place', 'Rating': 2.3},\n {'Name': 'UA IMAX @ Airport', 'Rating': 1.5},\n]\n\ndf_boss_favorite = pd.DataFrame(boss_favorite, columns=['Name', 'Rating'])\ndf_boss_favorite\n\n\n# #### 4. Eating, Shopping and Public transportation facility around cinema\n# The recommended cinema location needs to have many eating and shopping venues nearby. Convenient public transport is also required.\n# These data can be found by using FourSquare API to find these venues around the location. 
The radius of exploration distance is set to 500 meters, which is about 5 minutes walking distance.\n\n# Following type of venue category will be used to search\n\n#%%\n\nfs_categories = {\n 'Food': '4d4b7105d754a06374d81259',\n 'Shop & Service': '4d4b7105d754a06378d81259',\n 'Bus Stop': '52f2ab2ebcbc57f1066b8b4f',\n 'Metro Station': '4bf58dd8d48988d1fd931735',\n 'Nightlife Spot': '4d4b7105d754a06376d81259',\n 'Arts & Entertainment': '4d4b7104d754a06370d81259'\n}\n\n#%%\n\n', '.join([cat for cat in fs_categories])\n\n#%%\n\ncinema = df_cinemas.loc[0]\n\n#%%\n\nprint('Use the first cinema \"{}\" in the list as example to explore venues nearyby'.format(\n cinema['Name']))\n\n#%%\n\n# Install FourSquare client library\nget_ipython().system('pip install foursquare')\n\n#%%\n\nfs_act = None\nwith open('fs_act.json') as json_data:\n fs_act = json.load(json_data)\n\n#%%\n\nfs = foursquare.Foursquare(\n client_id=fs_act['client_id'], client_secret=fs_act['client_secret'])\n\n#%%\n\nRADIUS = 500 # 500m, around 5 minutes walking time\n\n#%%\n\n# Define a function to search nearby information and convert the result as dataframe\ndef venues_nearby(latitude, longitude, category, verbose=True):\n results = fs.venues.search(\n params={\n 'query': category,\n 'll': '{},{}'.format(latitude, longitude),\n 'radius': RADIUS,\n 'categoryId': fs_categories[category]\n }\n )\n df = json_normalize(results['venues'])\n cols = ['Name', 'Latitude', 'Longitude', 'Tips', 'Users', 'Visits']\n if(len(df) == 0):\n df = pd.DataFrame(columns=cols)\n else:\n df = df[['name', 'location.lat', 'location.lng',\n 'stats.tipCount', 'stats.usersCount', 'stats.visitsCount']]\n df.columns = cols\n if(verbose):\n print('{} \"{}\" venues are found within {}m of location'.format(\n len(df), category, RADIUS))\n return df\n\n\n# Find Metro Station around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Metro Station').head()\n\n\n# Find Bus Stop around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Bus Stop').head()\n\n\n# Find eating places around the cinema\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'], 'Food').head()\n\n#%%\n\nvenues_nearby(cinema['Latitude'], cinema['Longitude'],\n 'Arts & Entertainment').head()\n\n\n# ## Methodology\n\n# Methodology section which represents the main component of the report where you discuss and describe any exploratory data analysis that you did, any inferential statistical testing that you performed, and what machine learnings were used and why.\n\n# With above data, I can use content-based recommendation technique to resolve the problem.\n#\n# Combine with FourSquare API which provides how many venues in different category of Hong Kong cinemas, a matrix which captured characteristic of venues nearby cinema are built. Stakeholder's favorite list is the profile to combine with the matrix to become a weighted matrix of favorite cinema.\n#\n# The weighted matrix can be applied on 5 target locations with venues information to generate a ranking result. 
The the top one on the ranking list can be recommended to the stakeholder.\n#\n# Before building the matrix, I have to prepare the required data and apply some data analysis.\n\n# #### Data Cleansing and Preparation", "original_comment": "# Check the cinemas dataset contains any duplicated address\n", "target_code": "duplicated = df_cinemas.duplicated('Address', keep=False)\n", "project_metadata": {"full_name": "meghsat/CourseraIBMdatascience_course", "description": "In this repo consists of the projects I had done as part of the coursera's IBM data science Professional certificate.", "topics": [], "git_url": "git://github.com/meghsat/CourseraIBMdatascience_course.git", "stars": 3, "watchers": 3, "forks": 0, "created": "2020-04-08T05:37:45Z", "size": 4855, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 14626378}, "last_updated": "2020-05-28T09:51:40Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "compatibility": "Agree", "compatibility-score": 2, "precision": "Agree", "precision-score": 2, "coverage": "Disagree", "coverage-score": 1, "usefulness": "Agree", "usefulness-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Disagree", "coverage-score": 1, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "cinema.drop_duplicates(inplace=True)\n", "model": "docstring", "intent": "# Check the cinemas dataset contains any duplicated address"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. Pre-process\n\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ### MNIST LeNet5\n# ---\n# Zhiang Chen\n#\n# July 2016\n\n# #### 1. Import Packages\n\n#%%\n\nfrom __future__ import print_function\nimport time\nimport numpy as np\nimport tensorflow as tf\n\n\n# #### 2. Load Data\n\n#%%\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nmnist = input_data.read_data_sets('MNIST_data', one_hot=True)\n\n\n# #### 3. 
Pre-process\n\n#%%\n\nimage_size = 28\nnum_channels = 1 # greyscale\n\ntrain_data = mnist.train.images\ntrain_labels = mnist.train.labels\nvalidation_data = mnist.validation.images\nvalidation_labels = mnist.validation.labels\ntest_data = mnist.test.images\ntest_labels = mnist.test.labels\n\n\ndef reformat(data):\n reformated_data = data.reshape(-1, image_size,\n image_size, num_channels).astype(np.float32)\n return reformated_data\n\n\ntrain_dataset = reformat(train_data)\nvalidation_dataset = reformat(validation_data)\ntest_dataset = reformat(test_data)", "original_comment": "# print out all data shapes\n", "target_code": "print('Training set', train_dataset.shape, train_labels.shape)\nprint('Validation set', validation_dataset.shape, validation_labels.shape)\nprint('Test set', test_dataset.shape, test_labels.shape)\n", "project_metadata": {"full_name": "cwru-robotics/cwru_dnn", "description": "deep neural net explorations", "topics": [], "git_url": "git://github.com/cwru-robotics/cwru_dnn.git", "stars": 3, "watchers": 3, "forks": 2, "created": "2016-07-25T14:47:31Z", "size": 49625, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 329694, "Python": 19000, "C++": 17781, "CMake": 7310}, "last_updated": "2020-03-13T14:59:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "def fully_connected(prev_layer, num_units):\n \"\"\"\n Create a fully connectd layer with the given layer as input and the given number of neurons.\n :param prev_layer: Tensor\n The Tensor that acts as input into this layer\n :param num_units: int\n The size of the layer. 
That is, the number of units, nodes, or neurons.\n :returns Tensor\n A new fully connected layer\n \"\"\"\n layer = tf.layers.dense(prev_layer, num_units, activation=tf.nn.relu)\n return layer\n", "model": "no-comments", "intent": "# print out all data shapes"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n 
load_annotated_images().assign(source='annotated')\\n])\")\n\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n\ndf['source'].value_counts()\n\n\ndf.groupby(['source', 'partition']).size()\n\n\ndf['digit'].value_counts()\n\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n\ntarget_shape = (32, 32)\n\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) 
and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n\nMODEL_DIR\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n\nwandb.init()\n\n\n# ### Run Training\n\n\nMODEL_DIR\n\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n\nget_ipython().system('ls $MODEL_DIR')\n\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nfrom scipy.stats import describe\nfrom skimage import img_as_ubyte, img_as_float32\nfrom imgaug import augmenters as iaa\nfrom sklearn.model_selection import train_test_split\nimport os\nimport re\nimport glob\nimport os.path as osp\nimport pandas as pd\nimport numpy as np\nimport tqdm\nimport matplotlib.pyplot as plt\nfrom skimage import io as sk_io\nfrom skimage.transform import resize\nfrom skimage.color import gray2rgb, rgb2gray\nfrom skimage.exposure import rescale_intensity\nfrom cvutils.rectlabel import io as rectlabel_io\nfrom cvutils.mrcnn.session import init_keras_session\nfrom cvutils import visualize\nfrom celldom.dataset import utils as dataset_utils\nfrom celldom import seed\n\nimport celldom\nimport keras\nfrom keras.preprocessing import image\nimport wandb\nfrom wandb.keras import WandbCallback\n\ninit_keras_session()\n\nDATA_DIR_VALIDATED_EXTRACT 
= osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'validated', 'single')\nDATA_DIR_MANUAL_ANNOTATION = osp.join(\n celldom.get_dataset_dir(), 'training', 'digit', 'r0.6', 'manual', 'single')\nDATA_DIR_MANUAL_PARTITIONS = ['pt1', 'pt2', 'pt3', 'pt4']\n#MODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'], 'model', 'r0.6', 'digit_model')\nMODEL_DIR = osp.join(os.environ['CELLDOM_DATA_DIR'],\n 'model', 'r0.7', 'digit_model')\n\nMODEL_PATH_HEAD = osp.join(MODEL_DIR, 'single_digit_model_headonly.h5')\nMODEL_PATH = osp.join(MODEL_DIR, 'single_digit_model.h5')\n\nMODEL_DIR, DATA_DIR_VALIDATED_EXTRACT, DATA_DIR_MANUAL_ANNOTATION\n\n#%%\n\nget_ipython().system('ls $DATA_DIR_VALIDATED_EXTRACT | head -n 1')\n\n#%%\n\ndef get_actual_digit(file):\n return osp.basename(file).replace('.jpeg', '').split('-')[-1]\n\n\ndef load_extracted_images():\n res = []\n for f in glob.glob(osp.join(DATA_DIR_VALIDATED_EXTRACT, '*.jpeg')):\n digit = get_actual_digit(f)\n img = sk_io.imread(osp.join(DATA_DIR_VALIDATED_EXTRACT, f))\n res.append((int(digit), img, f))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file'])\n return res\n\n\ndef load_annotated_images():\n res = []\n for pt in DATA_DIR_MANUAL_PARTITIONS:\n df = dataset_utils.get_data_files(\n osp.join(DATA_DIR_MANUAL_ANNOTATION, pt))\n for i, r in tqdm.tqdm(df.iterrows(), desc='partition=' + pt):\n img_path, annot_path = r['image_path'], r['annot_path']\n if r['annot_exists']:\n try:\n shape, annotations = rectlabel_io.load_annotations(\n annot_path, assert_masks=False)\n assert len(annotations) == 1, 'Found multiple annotations in {}'.format(\n annot_path)\n digit = annotations[0].object_type\n if digit == 'NA':\n continue\n # Read RGB 8-bit image\n img = sk_io.imread(img_path)\n\n # Extract grayscale based on assumption of channel equality\n assert img.ndim == 3 and img.shape[-1] == 3\n assert np.allclose(img[..., 0], img[..., 1])\n assert np.allclose(img[..., 0], img[..., 2])\n img = img[..., 0]\n digit = int(digit)\n except:\n print('Failure occurred for annotation file {}'.format(annot_path))\n raise\n res.append((digit, img, img_path, pt))\n res = pd.DataFrame(res, columns=['digit', 'img', 'file', 'partition'])\n return res\n\n#%%\n\nget_ipython().run_cell_magic('time', '',\n \"df = pd.concat([\\n load_extracted_images().assign(source='extract'),\\n load_annotated_images().assign(source='annotated')\\n])\")\n\n#%%\n\n# r0.2 num files = 2372, r0.6 = 8067, r0.7 = 9375\nlen(df)\n\n#%%\n\ndf['source'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'partition']).size()\n\n#%%\n\ndf['digit'].value_counts()\n\n#%%\n\ndf.groupby(['source', 'digit']).size().unstack()\n\n#%%\n\ndtypes = df['img'].apply(lambda v: v.dtype)\nassert np.all(dtypes == np.uint8)\ndtypes.value_counts()\n\n#%%\n\npd.DataFrame([r['img'].shape for _, r in df.iterrows()]).describe()\n\n#%%\n\nidx = np.arange(len(df))\n\n# Vary test_size (starting large) to get a sense of accuracy with a bigger sample and then decrease for final model\nidx_train, idx_test = train_test_split(\n idx, test_size=.2, random_state=seed, stratify=df['digit'])\ndf_train, df_test = df.iloc[idx_train], df.iloc[idx_test]\n\nlen(df_train), len(df_test)\n\n#%%\n\npd.concat([df_train['digit'].value_counts(normalize=True),\n df_test['digit'].value_counts(normalize=True)], axis=1)\n\n\n# ## Training\n\n#%%\n\nget_ipython().run_line_magic('run', 'utils.py')\n\n#%%\n\ntarget_shape = (32, 32)\n\n#%%\n\ndef sometimes(aug): return iaa.Sometimes(.2, aug)\n\n\nseq = iaa.Sequential([\n # Simulate 
out-of-focus\n sometimes(iaa.GaussianBlur(sigma=.3)),\n\n # Randomly alter scaling and simulate zooming\n sometimes(iaa.OneOf([\n iaa.CropAndPad(px=(0, 12)),\n iaa.Crop(px=(0, 12))\n ])),\n\n # Rotate up to 3 degrees and randomly scale\n sometimes(iaa.Affine(\n rotate=3.,\n scale={\"x\": (.8, 1.4), \"y\": (.8, 1.4)},\n translate_px=(0, 5),\n mode='constant'\n )),\n\n # Randomly alter distribution\n sometimes(iaa.OneOf([\n iaa.Multiply((.8, 1.4)),\n iaa.Add((-25, 25)),\n iaa.ContrastNormalization((0.8, 1.2))\n ]))\n], random_order=True)\n\n\ndef augment(img):\n # img should be provided as float in [0, 1]\n assert img.min() >= 0 and img.max() <= 1.\n img = rescale_intensity(img, out_range=np.uint8).astype(np.uint8)\n res = img_as_float32(seq.augment_image(img))\n #print(res.shape, res.dtype, res.min(), res.max())\n return res\n\n#%%\n\ndef prep_image(img):\n # Note that gray2rgb will take care of converting from uint8 to float in 0-1\n assert img.dtype == np.uint8\n\n # Convert to 2D with target height/width\n img = gray2rgb(resize(img, target_shape, mode='constant',\n anti_aliasing=True)).astype(np.float32)\n\n # Rescale by min/max\n img = rescale_intensity(img, out_range=(0, 1))\n\n assert np.all(img <= 1.) and np.all(img >= 0.)\n return img\n\n\ntrain_x = np.stack([prep_image(v) for v in df_train['img']], 0)\ntrain_y = keras.utils.to_categorical(df_train['digit'].values, 10)\n\ntest_x = np.stack([prep_image(v) for v in df_test['img']], 0)\ntest_y = keras.utils.to_categorical(df_test['digit'].values, 10)\n\ntrain_generator = image.ImageDataGenerator(preprocessing_function=augment)\ntrain_batches = train_generator.flow(train_x, y=train_y, seed=seed)\n\ntest_generator = image.ImageDataGenerator()\ntest_batches = test_generator.flow(test_x, y=test_y, seed=seed)\n\ntrain_x.dtype, train_x.shape, test_x.shape\n\n#%%\n\n# Visualize augmentations\naug_imgs = []\nfor img in df_train['img'].sample(n=80):\n aug_imgs.append(prep_image(img))\n aug_imgs.append(augment(prep_image(img)))\nvisualize.display_images(aug_imgs, cols=8, size=10)\n\n#%%\n\ndescribe(aug_imgs[0].ravel())\n\n\n# ### Initialize Modeling\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\n# Clear everything currently in the modeling directory\nget_ipython().system('rm -rf $MODEL_DIR/*')\n\n#%%\n\nwandb.init()\n\n\n# ### Run Training\n\n#%%\n\nMODEL_DIR\n\n#%%\n\nif not osp.exists(MODEL_DIR):\n os.makedirs(MODEL_DIR)\n\n#%%\n\nget_ipython().system('ls $MODEL_DIR')\n\n#%%\n\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='head')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.001),\n metrics=['accuracy']\n)\n\n# Initialize from pre-trained model\nmodel.load_weights('/lab/repos/svhn/weights.hdf5')\n\n#%%\n\nsave_model = keras.callbacks.ModelCheckpoint(MODEL_PATH_HEAD, monitor='val_loss', mode='min', verbose=0,\n save_best_only=True, save_weights_only=False, period=1)\nearly_stopping = keras.callbacks.EarlyStopping(\n monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')\nlearning_rate = keras.callbacks.ReduceLROnPlateau(\n patience=3, monitor='val_loss', mode='min', verbose=1)\ncallbacks = [save_model, early_stopping, learning_rate]\n\n# tensorboard = keras.callbacks.TensorBoard(log_dir=MODEL_DIR)\n# callbacks = [save_model, early_stopping, learning_rate, tensorboard]\n\nwith wandb.monitor(display=False):\n model.fit_generator(\n train_batches,\n epochs=250,\n callbacks=callbacks + [WandbCallback(save_model=False)],\n 
validation_data=test_batches\n )\n\n\n# ### Fine-Tuning\n\n#%%\n\n#model = get_digit_model(input_shape=target_shape + (3,), mode='tune')\nmodel = get_digit_model(input_shape=target_shape + (3,), mode='all')\n\nmodel.compile(\n loss=keras.losses.categorical_crossentropy,\n optimizer=keras.optimizers.RMSprop(lr=.0001),\n metrics=['accuracy']\n)", "original_comment": "# Initialize from pre-trained model\n", "target_code": "model.load_weights(MODEL_PATH_HEAD)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "compatibility": "Agree", "compatibility-score": 2, "precision": "Strongly agree", "precision-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "compatibility": "Agree", "compatibility-score": 2, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3}], "predicted_code": "model.load_weights('/lab/repos/svhn/weights.hdf5')\n", "model": "natural", "intent": "# Initialize from pre-trained model"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n\n# ## Data\n\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n\n# DATA_DF.content[0]\n\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # music reviews conditioned on songs\n\n#%%\n\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nimport string\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom fastai.lm_rnn import *\nfrom fastai.nlp import *\nfrom torchtext import vocab, data\nimport spacy\nfrom sklearn.model_selection 
import train_test_split\nimport pandas as pd\nimport os\nimport numpy as np\nimport dill as pickle\nfrom IPython.core.debugger import set_trace\nimport pdb\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n\nprint('cuda.is_available:', torch.cuda.is_available())\nprint(\n f'available: {torch.cuda.device_count()}; current: {torch.cuda.current_device()}')\nDEVICE = torch.device(\n f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu')\nprint(DEVICE)\nprint('pytorch', torch.__version__)\n\n#%%\n\n# ## Data\n\n#%%\n\nBASE_DIR = os.getcwd()\nDATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')\n\nDATA_F = os.path.join(DATA_DIR, f'reviews_and_metadata_5yrs.json')\nDATA_DF = pd.read_json(DATA_F)\nlen(DATA_DF)\n\n#%%\n\n# DATA_DF.content[0]\n\n#%%\n\n# remove all double-quotation marks\n#DATA_DF.content = DATA_DF.content.apply(lambda x: x.replace('\"', ''))\n# DATA_DF.content[0]\n\n#%%", "original_comment": "# remove reviews without audio features from Spotify\n", "target_code": "DATA_DF = DATA_DF.loc[~DATA_DF.audio_features.isna()]\n", "project_metadata": {"full_name": "iconix/openai", "description": "OpenAI Scholar, general materials", "topics": [], "git_url": "git://github.com/iconix/openai.git", "stars": 16, "watchers": 16, "forks": 3, "created": "2018-11-02T19:26:13Z", "size": 69033, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 22113154, "Python": 46353, "JavaScript": 8783, "Shell": 2297, "HTML": 970}, "last_updated": "2020-06-01T14:04:53Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "sp = spacy.load('en')\n", "model": "docstring", "intent": "# remove reviews without audio features from Spotify"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n\ndata[data.columns[data.dtypes == 
int]]\n\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)\n\n\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Principal Component Analysis Assignment\n\n#%%\n\nimport numpy as np\nimport pandas as pd\n\nfrom sklearn.decomposition import PCA\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\n\nimport seaborn as sns\nimport matplotlib.pyplot as plt\n\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ### Import the [PitchFX data set](https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv).\n\n#%%\n\ndata = pd.read_csv(\n 'https://docs.google.com/spreadsheets/d/1pmBtSw7v_tU_dIX1-4E8_Q7wC43fDs6LGDQzN49-ffk/export?format=csv')\n\n\n# ### Keep only the pitch type and the numeric columns (exluding ID fields).\n#\n# * Drop any remaining records that contain null values.\n# * Consider `pitchType` as `y`/target and the remaining columns to be `X`/features.\n\n#%%\n\ndata[data.columns[data.dtypes == int]]\n\n#%%\n\ndata.pitchType.value_counts()\nlb_make = LabelEncoder()\ndata['typeid'] = lb_make.fit_transform(data[\"pitchType\"])\ndata[[\"pitchType\", \"typeid\"]].head(11)\n\n#%%\n\ndata = data.dropna()\ny = data['typeid']\nX = data[data.columns[((data.dtypes == float) | (data.dtypes == int))]].drop(\n columns=['pitcherId', 'catcherId', 'batterId', 'umpireId', 'typeid'])\nX = X.dropna()\nX.info()\n\n\n# ### Reduce the dimensionality of the data using PCA to two components.\n#\n# Don't forget to scale.\n\n#%%\n\nscale = StandardScaler()\nX_std = scale.fit_transform(X)\npca = PCA(n_components=2)\ntwo = pca.fit_transform(X_std)", "original_comment": "# ### Compute the explained variance for new data set.\n", "target_code": "pca.get_covariance()\n", "project_metadata": {"full_name": "thinkful-dsi-grackle/dsi7_student_pair_work", "description": null, "topics": [], "git_url": "git://github.com/thinkful-dsi-grackle/dsi7_student_pair_work.git", "stars": 4, "watchers": 4, "forks": 7, "created": "2020-08-31T19:02:03Z", "size": 126351, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 401674326}, "last_updated": "2021-01-08T04:04:50Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, 
"compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "pca.explained_variance_ratio_\n", "model": "natural", "intent": "# Compute the explained variance for new data set."}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n\ncelldom.get_repo_dir()\n\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# ## Path Metadata Validation Example\n#\n# This notebook demonstrates how to check that the metadata associated with an experiment will be interpreted correctly based on an experiment configuration file.\n\n#%%\n\nfrom celldom.config import experiment_config\nimport celldom\nimport glob\nimport os.path as osp\nimport os\nget_ipython().run_line_magic('run', '-m celldom.nb.logging')\n\n\n# Set the experiment configuration to be loaded:\n\n#%%\n\ncelldom.get_repo_dir()\n\n#%%\n\nexp_config_path = osp.join(celldom.get_repo_dir(\n), 'config', 'experiment', 'experiment_example_G3.yaml')\nexp_config_path\n\n#%%\n\nget_ipython().system('cat $exp_config_path')\n\n\n# #### Load Configuration\n\n#%%\n\nexp_config = experiment_config.ExperimentConfig(\n celldom.read_config(exp_config_path))\n\n#%%\n\nexp_config.conf\n\n\n# #### Test Path Parsing\n\n#%%\n\n# Create a path to test parsing of metadata properties on\ntest_path = 'JeffsData/_2018.06.14 EXP SUM Control 0.1uM with 5mL gravity/2018.06.14 White 3 Control/' '2018.06.14 Pink 3 1.0uM 0 hr/BFF_16X_St_001_Apt_016_201806150024.tif'\ntest_path\n\n#%%", "original_comment": "# Test that the path can be parsed successfully\n", "target_code": "exp_config.parse_path(test_path)\n", "project_metadata": {"full_name": "hammerlab/SmartCount", "description": "Repository for collaboration on Celldom computer vision solutions", "topics": [], "git_url": "git://github.com/hammerlab/SmartCount.git", "stars": 2, "watchers": 2, "forks": 0, "created": "2018-05-14T16:08:11Z", "size": 92558, "license": "apache-2.0", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 42802909, "HTML": 358985, "Python": 244943, "Shell": 175}, "last_updated": "2020-12-04T00:25:05Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly 
disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}], "predicted_code": "get_ipython().run_line_magic('run',\n '-m celldom.nb.logging\n", "model": "natural", "intent": "# Test that the path can be parsed successfully"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. 
You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n\ntype(AAPL)\n\n\nAAPL.shape\n\n\nAAPL.columns\n\n\ntype(AAPL.columns)\n\n\nAAPL.index\n\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n\nAAPL.head()\n\n\nAAPL.tail()\n\n\nAAPL.info()\n\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n\nAAPL.head(7)\n\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n\nlow = AAPL.Low\n\n\ntype(low)\n\n\nlow.head()\n\n\nlows = low.values\n\n\ntype(lows)\n\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n\nwb_df.head()\n\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. 
In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n\npop_df.info()\n\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n\nnp_vals\n\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n\nnp_vals_log10\n\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n\npop_df_log10\n\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. 
Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n\nusers = pd.DataFrame(data)\n\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n\ndata2 = dict(zipped)\n\n\nusers2 = pd.DataFrame(data2)\n\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n\nusers\n\n\n# #### Broadcasting with a dict\n\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n\nresults = pd.DataFrame(data)\n\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n\nresults.columns = ['height (in)', 'sex']\n\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. ```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. 
Store the result as ```df```.\n\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n\ndata = dict(zipped)\n\n\ndata\n\n\ndata_df = pd.DataFrame.from_dict(data)\n\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n\nbillboard_dict = dict(billboard_zipped)\n\n\nbillboard_dict\n\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n\nbillboard\n\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. 
A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. '0' indicates that the value is still provisional.\n# ```\n\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n\n# Print both the DataFrames\ndf1.head()\n\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n\n# Print the output of df1.head()\ndf1.head()\n\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n\naapl.head()\n\n\naapl.info()\n\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n\nclose_arr = aapl['close'].values\n\n\ntype(close_arr)\n\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n\nclose_series = aapl['close']\n\n\ntype(close_series)\n\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n\ndf.info()\n\n\ndf.head()\n\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n\niris.shape\n\n\niris.head()\n\n\n# #### Line plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
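\n#\n# (A side note on the mechanics: ***df.plot()*** forwards keyword arguments such as ***color*** and ***title*** to matplotlib and returns the Axes it drew on, so labels can also be set on that object instead of through ***plt.*** calls. A minimal sketch with made-up numbers:)\n#\n# ```python\n# import pandas as pd\n# import matplotlib.pyplot as plt\n#\n# toy = pd.DataFrame({'Temperature': [46.2, 44.6, 44.1, 43.8]})\n# ax = toy.plot(color='red', title='A toy series')  # .plot() returns a matplotlib Axes\n# ax.set_xlabel('Hours')\n# ax.set_ylabel('Temperature (degrees F)')\n# plt.show()\n# ```\n#\n# 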
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n\ndata_zipped = list(zip(cols, values))\n\n\ndata_dict = dict(data_zipped)\n\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n\ndf\n\n\ndf.info()\n\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.info()\n\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 17.84050478, 
13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cd\n#\n# Pandas relies on the ```.hist()``` method to not only generate histograms, but also plots of probability density functions (PDFs) and cumulative density functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill is comprised of the tip.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```.\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entires\n# * mean: average of entries\n# * std: standard deviation\n# * min: miniumum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n\niris['sepal length (cm)'].count() # Applied to Series\n\n\niris['sepal width (cm)'].count() # Applied to Series\n\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns series\n\n\n# #### Averages\n#\n# * measures the tendency to a central value of a measurement\n\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures spread of a measurement\n\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical value that splits the data into two sets\n# * one with the 
fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n\niris.min()\n\n\niris.max()\n\n\n# #### Box Plots\n\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
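\n#\n# (Why the two can disagree so much, in miniature: one extreme value drags the mean upward while the median barely moves. Illustrative fares only, not the Titanic data:)\n#\n# ```python\n# import pandas as pd\n#\n# fares = pd.Series([7.25, 8.05, 13.0, 26.55, 512.33])  # one extreme fare\n# print(fares.mean())    # about 113, pulled up by the outlier\n# print(fares.median())  # 13.0, barely affected\n# ```\n#\n# 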
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf.fare.describe()\n\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
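\n#\n# (The idea in a nutshell, with made-up temperatures: two samples can share a mean while spreading out very differently, which is exactly what ***.std()*** measures.)\n#\n# ```python\n# import pandas as pd\n#\n# steady = pd.Series([31, 32, 33, 32, 32])   # tightly clustered\n# swingy = pd.Series([10, 55, 20, 50, 25])   # same mean, wide spread\n# print(steady.mean(), swingy.mean())        # both 32.0\n# print(steady.std(), swingy.std())          # roughly 0.7 vs roughly 19.6\n# ```\n#\n# 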
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n\n# Print the mean of the January and March data\njanuary.mean()\n\n\nmarch.mean()\n\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n\nsetosa['species'].unique()\n\n\nversicolor['species'].unique()\n\n\nvirginica['species'].unique()\n\n\nsetosa.head(2)\n\n\nversicolor.head(2)\n\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All Data\n\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n 
alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n\ndescribe_all = iris.describe()\ndescribe_all\n\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
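\n#\n# (Roughly what those terms mean, as a minimal sketch on a tiny made-up series; the chapter builds them up properly below.)\n#\n# ```python\n# import pandas as pd\n#\n# idx = pd.date_range('2015-02-01', periods=4, freq='12H')\n# units = pd.Series([3, 5, 2, 6], index=idx)\n#\n# units.resample('D').sum()     # downsampling: 12-hourly rows -> daily totals\n# units.resample('6H').ffill()  # upsampling: 12-hourly rows -> 6-hourly, forward-filled\n# ```\n#\n# 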
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n\nsales.reindex(evening_2_11, method='ffill')\n\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n\ndf1.head()\n\n\ndf2.head()\n\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
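\n#\n# (For example, the format string mentioned above parses compact timestamps like these; a minimal sketch with made-up values:)\n#\n# ```python\n# import pandas as pd\n#\n# raw = ['2015-01-01 091234', '2015-01-02 104501']\n# pd.to_datetime(raw, format='%Y-%m-%d %H%M%S')\n# # DatetimeIndex(['2015-01-01 09:12:34', '2015-01-02 10:45:01'], dtype='datetime64[ns]', freq=None)\n# ```\n#\n# 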
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
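\n#\n# (In miniature, with a made-up weekday series: reindexing onto a larger index leaves the unmatched dates as NaN unless a fill method is supplied.)\n#\n# ```python\n# import pandas as pd\n#\n# weekdays = pd.Series([1, 2, 3],\n#                      index=pd.to_datetime(['2016-07-01', '2016-07-04', '2016-07-05']))\n# full_range = pd.date_range('2016-07-01', '2016-07-05')\n#\n# weekdays.reindex(full_range)                  # July 2nd and 3rd become NaN\n# weekdays.reindex(full_range, method='ffill')  # the gap is forward-filled instead\n# ```\n#\n# 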
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n\ndaily_mean.loc['2015-2-2']\n\n\nsales.loc['2015-2-2', 'Units']\n\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n\nsales.resample('D').sum().head()\n\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter |\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n\nsales.loc[:, 
'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result ***df2***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. Store the result in ***february_lows***.\n\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n\n# Extract temperature data for February: february\nfebruary = august = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# Rolling mean and frequency\n# In this exercise, some hourly weather data is pre-loaded for you. 
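\n#\n# (A minimal sketch of a rolling mean on a made-up series before the exercise: each output value averages the current point with the points in the window behind it, so the first few values are NaN until the window fills.)\n#\n# ```python\n# import pandas as pd\n#\n# temps = pd.Series([60, 62, 64, 70, 68, 66])\n# temps.rolling(window=3).mean()\n# # NaN, NaN, 62.0, 65.33..., 67.33..., 68.0\n# ```\n#\n# 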
You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends. You can read more about them here.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. 
Assign the result to ***daily_highs_smoothed*** and print the result.\n\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
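\n#\n# (The distinction in a minimal sketch with made-up timestamps: a Series whose *Index* holds the datetimes is localized directly, whereas datetimes stored in a *column* go through the ***.dt*** accessor, which is the route this exercise takes.)\n#\n# ```python\n# import pandas as pd\n#\n# stamps = pd.to_datetime(['2015-07-01 08:00', '2015-07-01 09:30'])\n#\n# # datetimes in the Index: tz methods apply to the Series itself\n# s = pd.Series([1, 2], index=stamps)\n# s.tz_localize('US/Central').tz_convert('US/Pacific')\n#\n# # datetimes in a column: use the .dt accessor instead\n# flights = pd.DataFrame({'wheels_off': stamps})\n# flights['wheels_off'].dt.tz_localize('US/Central').dt.tz_convert('US/Pacific')\n# ```\n#\n# 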
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes not only in your data, but also in your plotting.\n#\n# In this 
exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. 
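Because pandas arithmetic aligns on index labels before computing, subtracting the two Series directly would produce all-NaN values - a minimal sketch with toy data, separate from the exercise:\n#\n# ```python\n# a = pd.Series([1, 2], index=pd.to_datetime(['2011-01-01', '2011-01-02']))\n# b = pd.Series([3, 4], index=pd.to_datetime(['2010-01-01', '2010-01-02']))\n# a - b  # no labels overlap, so every entry of the result is NaN\n# ```\n#\n# 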
This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\ndf_clean.head(3)\n\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. 
Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom numpy import NaN\nfrom glob import glob\nimport re\n\n#%%\n\npd.set_option('max_columns', 200)\npd.set_option('max_rows', 300)\npd.set_option('display.expand_frame_repr', True)\n\n\n# ### Data Files Location\n#\n# * Most data files for the exercises can be found [here](#https://www.datacamp.com/courses/pandas-foundations)\n# * [1981-2010 NOAA Austin Climate Normals](#https://assets.datacamp.com/production/course_1639/datasets/NOAA_QCLCD_2011_hourly_13904.txt)\n# * [July 2015 Austin airport departures (Southwest Airlines)](#https://assets.datacamp.com/production/course_1639/datasets/austin_airport_departure_data_2015_july.csv)\n# * [Automobile miles per gallon](#https://assets.datacamp.com/production/course_1639/datasets/auto-mpg.csv)\n# * [Life expectancy at birth (Gapminder)](#https://assets.datacamp.com/production/course_1639/datasets/life_expectancy_at_birth.csv)\n# * [Stock data (messy)](#https://assets.datacamp.com/production/course_1639/datasets/messy_stock_data.tsv)\n# * [Percentage of bachelor's degrees awarded to women in the USA](#https://assets.datacamp.com/production/course_1639/datasets/percent-bachelors-degrees-women-usa.csv)\n# * [Tips](#https://assets.datacamp.com/production/course_1639/datasets/tips.csv)\n# * [Titanic](#https://assets.datacamp.com/production/course_1639/datasets/titanic.csv)\n# * [2010 Austin weather](#https://assets.datacamp.com/production/course_1639/datasets/weather_data_austin_2010.csv)\n# * [World Bank World Development Indicators](#https://assets.datacamp.com/production/course_1639/datasets/world_ind_pop_data.csv)\n# * [World population](#https://assets.datacamp.com/production/course_1639/datasets/world_population.csv)\n# * Other data files may be found in my [DataCamp repository](#https://github.com/trenton3983/DataCamp/tree/master/data)\n\n# # pandas DataFrames\n#\n# ***Course Description***\n#\n# Pandas DataFrames are the most widely used in-memory representation of complex data collections within Python. Whether in finance, scientific fields, or data science, a familiarity with Pandas is essential. This course teaches you to work with real-world data sets containing both string and numeric data, often structured around time series. 
You will learn powerful analysis, selection, and visualization techniques in this course.\n\n# ## Data ingestion & inspection\n#\n# In this chapter, you will be introduced to Panda's DataFrames. You will use Pandas to import and inspect a variety of datasets, ranging from population data obtained from The World Bank to monthly stock data obtained via Yahoo! Finance. You will also practice building DataFrames from scratch, and become familiar with Pandas' intrinsic data visualization capabilities.\n\n# ### Review pandas DataFrames\n#\n# * Example: DataFrame of Apple Stock data\n\n#%%\n\nAAPL = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n index_col='Date', parse_dates=True)\n\n#%%\n\nAAPL.head()\n\n\n# * The rows are labeled by a special data structure called an Index.\n# * Indexes in Pandas are tailored lists of labels that permit fast look-up and some powerful relational operations.\n# * The index labels in the AAPL DataFrame are dates in reverse chronological order.\n# * Labeled rows & columns improves the clarity and intuition of many data analysis tasks.\n\n#%%\n\ntype(AAPL)\n\n#%%\n\nAAPL.shape\n\n#%%\n\nAAPL.columns\n\n#%%\n\ntype(AAPL.columns)\n\n#%%\n\nAAPL.index\n\n#%%\n\ntype(AAPL.index)\n\n\n# * DataFrames can be sliced like NumPy arrays or Python lists using colons to specify the start, end and stride of a slice.\n\n#%%\n\n# Start of the DataFrame to the 5th row, inclusive of all columns\nAAPL.iloc[:5, :]\n\n#%%\n\n# Start at the 5th last row to the end of the DataFrame using a negative index\nAAPL.iloc[-5:, :]\n\n#%%\n\nAAPL.head()\n\n#%%\n\nAAPL.tail()\n\n#%%\n\nAAPL.info()\n\n#%%\n\nAAPL.Close.plot(kind='line')\n\n# Add first subplot\nplt.subplot(2, 1, 1)\nAAPL.Close.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Close')\nplt.ylabel('Value - $')\nplt.xlabel('Year')\n\n# Add second subplot\nplt.subplot(2, 1, 2)\nAAPL.Volume.plot(kind='line')\n\n# Add title and specify axis labels\nplt.title('Volume')\nplt.ylabel('Number of Shares')\nplt.xlabel('Year')\n\n# Display the plots\nplt.tight_layout()\nplt.show()\n\n\n# #### Broadcasting\n#\n# * Assigning scalar value to column slice broadcasts value to each row\n\n#%%\n\nAAPL.iloc[::3, -1] = np.nan # every 3rd row of Volume is now NaN\n\n#%%\n\nAAPL.head(7)\n\n#%%\n\nAAPL.info()\n\n\n# * Note Volume now has few non-null numbers\n\n# #### Series\n\n#%%\n\nlow = AAPL.Low\n\n#%%\n\ntype(low)\n\n#%%\n\nlow.head()\n\n#%%\n\nlows = low.values\n\n#%%\n\ntype(lows)\n\n#%%\n\nlows[0:5]\n\n\n# * A Pandas Series, then, is a 1D labeled NumPy array and a DataFrame is a 2D labeled array whose columns as Series\n\n# ### Exercises\n\n# #### Inspecting your data\n#\n# You can use the DataFrame methods ```.head()``` and ```.tail()``` to view the first few and last few rows of a DataFrame. In this exercise, we have imported pandas as ```pd``` and loaded population data from 1960 to 2014 as a DataFrame ```df```. This dataset was obtained from the World Bank.\n#\n# Your job is to use ```df.head()``` and ```df.tail()``` to verify that the first and last rows match a file on disk. In later exercises, you will see how to extract values from DataFrames with indexing, but for now, manually copy/paste or type values into assignment statements where needed. 
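As a preview of that indexing, the first and last entries could be pulled out programmatically - a minimal sketch, assuming ```df``` holds the World Bank data with ```'Year'``` and ```'Total Population'``` columns:\n#\n# ```python\n# df['Year'].iloc[0], df['Total Population'].iloc[-1]  # first and last values\n# ```\n#\n# 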
Select the correct answer for the first and last values in the ```'Year'``` and ```'Total Population'``` columns.\n#\n# ***Instructions***\n#\n# Possible Answers\n# * First: 1980, 26183676.0; Last: 2000, 35.\n# * First: 1960, 92495902.0; Last: 2014, 15245855.0.\n# * First: 40.472, 2001; Last: 44.5, 1880.\n# * First: CSS, 104170.0; Last: USA, 95.203.\n\n#%%\n\nwb_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_ind_pop_data.csv')\n\n#%%\n\nwb_df.head()\n\n#%%\n\nwb_df.tail()\n\n\n# #### DataFrame data types\n#\n# Pandas is aware of the data types in the columns of your DataFrame. It is also aware of null and ```NaN``` ('Not-a-Number') types which often indicate missing data. In this exercise, we have imported pandas as ```pd``` and read in the world population data which contains some ```NaN``` values, a value often used as a place-holder for missing or otherwise invalid data entries. Your job is to use ```df.info()``` to determine information about the total count of ```non-null``` entries and infer the total count of ```'null'``` entries, which likely indicates missing data. Select the best description of this data set from the following:\n#\n# ***Instructions***\n#\n# Possible Answers\n# * The data is all of type float64 and none of it is missing.\n# * The data is of mixed type, and 9914 of it is missing.\n# * The data is of mixed type, and 3460 float64s are missing.\n# * The data is all of type float64, and 3460 float64s are missing.\n\n# ```python\n# \n# RangeIndex: 13374 entries, 0 to 13373\n# Data columns (total 5 columns):\n# CountryName 13374 non-null object\n# CountryCode 13374 non-null object\n# Year 13374 non-null int64\n# Total Population 9914 non-null float64\n# Urban population (% of total) 13374 non-null float64\n# dtypes: float64(2), int64(1), object(2)\n# memory usage: 522.5+ KB\n# ```\n\n#%%\n\nwb_df.info()\n\n\n# #### NumPy and pandas working together\n# Pandas depends upon and interoperates with NumPy, the Python library for fast numeric array computations. For example, you can use the DataFrame attribute ```.values``` to represent a DataFrame ```df``` as a NumPy array. You can also pass pandas data structures to NumPy methods. In this exercise, we have imported pandas as ```pd``` and loaded world population data every 10 years since 1960 into the DataFrame ```df```. This dataset was derived from the one used in the previous exercise.\n#\n# Your job is to extract the values and store them in an array using the attribute ```.values```. You'll then use those values as input into the NumPy ```np.log10()``` method to compute the base 10 logarithm of the population values. 
Finally, you will pass the entire pandas DataFrame into the same NumPy ```np.log10()``` method and compare the results.\n#\n# ***Instructions***\n#\n# * Import ```numpy``` using the standard alias ```np```.\n# * Assign the numerical values in the DataFrame ```df``` to an array ```np_vals``` using the attribute ```values```.\n# * Pass ```np_vals``` into the NumPy method ```log10()``` and store the results in ```np_vals_log10```.\n# * Pass the entire ```df``` DataFrame into the NumPy method ```log10()``` and store the results in ```df_log10```.\n# * Inspect the output of the ```print()``` code to see the ```type()``` of the variables that you created.\n\n#%%\n\npop_df = pd.read_csv(\n r'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv')\n\n#%%\n\npop_df.info()\n\n#%%\n\n# Create array of DataFrame values: np_vals\nnp_vals = pop_df.values\n\n#%%\n\nnp_vals\n\n#%%\n\n# Create new array of base 10 logarithm values: np_vals_log10\nnp_vals_log10 = np.log10(np_vals)\n\n#%%\n\nnp_vals_log10\n\n#%%\n\n# Create array of new DataFrame by passing df to np.log10(): df_log10\npop_df_log10 = np.log10(pop_df)\n\n#%%\n\npop_df_log10\n\n#%%\n\n# Print original and new data containers\n[print(x, 'has type', type(eval(x)))\n for x in ['np_vals', 'np_vals_log10', 'pop_df', 'pop_df_log10']]\n\n\n# ***As a data scientist, you'll frequently interact with NumPy arrays, pandas Series, and pandas DataFrames, and you'll leverage a variety of NumPy and pandas methods to perform your desired computations. Understanding how NumPy and pandas work together will prove to be very useful.***\n\n# ### Building DataFrames from Scratch\n#\n# * DataFrames read in from CSV\n# ```python\n# pd.read_csv()\n# ```\n\n# * DataFrames from dict (1)\n\n#%%\n\ndata = {'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],\n 'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],\n 'visitors': [139, 237, 326, 456],\n 'signups': [7, 12, 3, 5]}\n\n#%%\n\nusers = pd.DataFrame(data)\n\n#%%\n\nusers\n\n\n# * DataFrames from dict (2)\n# * lists\n\n#%%\n\ncities = ['Austin', 'Dallas', 'Austin', 'Dallas']\nsignups = [7, 12, 3, 5]\nweekdays = ['Sun', 'Sun', 'Mon', 'Mon']\nvisitors = [139, 237, 326, 456]\n\nlist_labels = ['city', 'signups', 'visitors', 'weekday']\nlist_cols = [cities, signups, visitors, weekdays] # list of lists\n\nzipped = list(zip(list_labels, list_cols)) # tuples\nzipped\n\n\n# * DataFrames from dict (3)\n\n#%%\n\ndata2 = dict(zipped)\n\n#%%\n\nusers2 = pd.DataFrame(data2)\n\n#%%\n\nusers2\n\n\n# #### Broadcasting\n#\n# * Saves time by generating long lists, arrays or columns without loops\n\n#%%\n\nusers['fees'] = 0 # Broadcasts value to entire column\n\n#%%\n\nusers\n\n\n# #### Broadcasting with a dict\n\n#%%\n\nheights = [59.0, 65.2, 62.9, 65.4, 63.7, 65.7, 64.1]\n\n#%%\n\ndata = {'height': heights, 'sex': 'M'} # M is broadcast to the entire column\n\n#%%\n\nresults = pd.DataFrame(data)\n\n#%%\n\nresults\n\n\n# #### Index and columns\n#\n# * We can assign list of strings to the attributes columns and index as long as they are of suitable length.\n\n#%%\n\nresults.columns = ['height (in)', 'sex']\n\n#%%\n\nresults.index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n\n#%%\n\nresults\n\n\n# ### Exercises\n\n# #### Zip lists to build a DataFrame\n#\n# In this exercise, you're going to make a pandas DataFrame of the top three countries to win gold medals since 1896 by first building a dictionary. ```list_keys``` contains the column names ```'Country'``` and ```'Total'```. 
```list_values``` contains the full names of each country and the number of gold medals awarded. The values have been taken from [Wikipedia](#https://en.wikipedia.org/wiki/All-time_Olympic_Games_medal_table).\n#\n# Your job is to use these lists to construct a list of tuples, use the list of tuples to construct a dictionary, and then use that dictionary to construct a DataFrame. In doing so, you'll make use of the ```list()```, ```zip()```, ```dict()``` and ```pd.DataFrame()``` functions. Pandas has already been imported as pd.\n#\n# Note: The [zip()](#https://docs.python.org/3/library/functions.html#zip) function in Python 3 and above returns a special zip object, which is essentially a generator. To convert this ```zip``` object into a list, you'll need to use ```list()```. You can learn more about the ```zip()``` function as well as generators in [Python Data Science Toolbox (Part 2)](#https://www.datacamp.com/courses/python-data-science-toolbox-part-2).\n#\n# ***Instructions***\n#\n# * Zip the 2 lists ```list_keys``` and ```list_values``` together into one list of (key, value) tuples. Be sure to convert the ```zip``` object into a list, and store the result in ```zipped```.\n# * Inspect the contents of ```zipped``` using ```print()```. This has been done for you.\n# * Construct a dictionary using ```zipped```. Store the result as ```data```.\n# * Construct a DataFrame using the dictionary. Store the result as ```df```.\n\n#%%\n\nlist_keys = ['Country', 'Total']\nlist_values = [['United States', 'Soviet Union',\n 'United Kingdom'], [1118, 473, 273]]\n\n#%%\n\nzipped = list(zip(list_keys, list_values)) # tuples\nzipped\n\n#%%\n\ndata = dict(zipped)\n\n#%%\n\ndata\n\n#%%\n\ndata_df = pd.DataFrame.from_dict(data)\n\n#%%\n\ndata_df\n\n\n# #### Labeling your data\n#\n# You can use the DataFrame attribute ```df.columns``` to view and assign new string labels to columns in a pandas DataFrame.\n#\n# In this exercise, we have imported pandas as ```pd``` and defined a DataFrame ```df``` containing top Billboard hits from the 1980s (from [Wikipedia](#https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number-one_singles_of_the_1980s#1980)). Each row has the year, artist, song name and the number of weeks at the top. However, this DataFrame has the column labels ```a, b, c, d```. Your job is to use the ```df.columns``` attribute to re-assign descriptive column labels.\n#\n# ***Instructions***\n#\n# * Create a list of new column labels with ```'year'```, ```'artist'```, ```'song'```, ```'chart weeks'```, and assign it to ```list_labels```.\n# * Assign your list of labels to ```df.columns```.\n\n#%%\n\nbillboard_values = np.array([['1980', 'Blondie', 'Call Me', '6'],\n ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],\n ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']]).transpose()\nbillboard_keys = ['a', 'b', 'c', 'd']\n\nbillboard_zipped = list(zip(billboard_keys, billboard_values))\nbillboard_zipped\n\n#%%\n\nbillboard_dict = dict(billboard_zipped)\n\n#%%\n\nbillboard_dict\n\n#%%\n\nbillboard = pd.DataFrame.from_dict(billboard_dict)\n\n#%%\n\nbillboard\n\n#%%\n\n# Build a list of labels: list_labels\nlist_labels = ['year', 'artist', 'song', 'chart weeks']\n\n#%%\n\n# Assign the list of labels to the columns attribute: df.columns\nbillboard.columns = list_labels\n\n#%%\n\nbillboard\n\n\n# #### Building DataFrames with broadcasting\n#\n# You can implicitly use 'broadcasting', a feature of NumPy, when creating pandas DataFrames. 
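For instance, a scalar paired with a list in the construction dictionary is repeated for every row - a minimal sketch with toy values, separate from the exercise below:\n#\n# ```python\n# pd.DataFrame({'city': ['Erie', 'York'], 'state': 'PA'})  # 'PA' is broadcast to both rows\n# ```\n#\n# 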
In this exercise, you're going to create a DataFrame of cities in Pennsylvania that contains the city name in one column and the state name in the second. We have imported the names of 15 cities as the list ```cities```.\n#\n# Your job is to construct a DataFrame from the list of cities and the string ```'PA'```.\n#\n# ***Instructions***\n#\n# * Make a string object with the value 'PA' and assign it to state.\n# * Construct a dictionary with 2 key:value pairs: 'state':state and 'city':cities.\n# * Construct a pandas DataFrame from the dictionary you created and assign it to df\n\n#%%\n\ncities = ['Manheim', 'Preston park', 'Biglerville',\n 'Indiana', 'Curwensville', 'Crown',\n 'Harveys lake', 'Mineral springs', 'Cassville',\n 'Hannastown', 'Saltsburg', 'Tunkhannock',\n 'Pittsburgh', 'Lemasters', 'Great bend']\n\n#%%\n\n# Make a string with the value 'PA': state\nstate = 'PA'\n\n#%%\n\n# Construct a dictionary: data\ndata = {'state': state, 'city': cities}\n\n#%%\n\n# Construct a DataFrame from dictionary data: df\npa_df = pd.DataFrame.from_dict(data)\n\n#%%\n\n# Print the DataFrame\nprint(pa_df)\n\n\n# ### Importing & Exporting Data\n#\n# * Dataset: Sunspot observations collected from SILSO\n#\n# ```python\n# Format: Comma Separated values (adapted for import in spreadsheets)\n# The separator is the semicolon ';'.\n#\n# Contents:\n# Column 1-3: Gregorian calendar date\n# - Year\n# - Month\n# - Day\n# Column 4: Date in fraction of year.\n# Column 5: Daily total sunspot number. A value of -1 indicates that no number is available for that day (missing value).\n# Column 6: Daily standard deviation of the input sunspot numbers from individual stations.\n# Column 7: Number of observations used to compute the daily value.\n# Column 8: Definitive/provisional indicator. '1' indicates that the value is definitive. 
'0' indicates that the value is still provisional.\n# ```\n\n#%%\n\nfilepath = r'data/silso_sunspot_data_1818-2019.csv'\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';')\nsunspots.info()\n\n#%%\n\nsunspots.iloc[10:20, :]\n\n\n# #### Problems\n#\n# * CSV file has no column headers\n# * Columns 0-2: Gregorian date (year, month, day)\n# * Column 3: Date as fraction as year\n# * Column 4: Daily total sunspot number\n# * Column 5: Definitive / provisional indicator (1 OR 0)\n# * Missing values in column 4: indicated by -1\n# * Date representation inconvenient\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None)\nsunspots.iloc[10:20, :]\n\n\n# #### Using names keyword\n\n#%%\n\ncol_names = ['year', 'month', 'day', 'dec_date',\n 'tot_sunspots', 'daily_std', 'observations', 'definite']\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';', header=None, names=col_names)\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (1)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values='-1')\nsunspots.iloc[10:20, :]\n\n\n# #### Using na_values keyword (2)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values=' -1')\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Using na_values keyword (3)\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']})\nsunspots.iloc[10:20, :]\n\n\n# #### Using parse_dates keyword\n\n#%%\n\nsunspots = pd.read_csv(filepath, sep=';',\n header=None,\n names=col_names,\n na_values={'tot_sunspots': [' -1'],\n 'daily_std': ['-1']},\n parse_dates=[[0, 1, 2]])\nsunspots.iloc[10:20, :]\n\n\n# #### Inspecting DataFrame\n\n#%%\n\nsunspots.info()\n\n\n# #### Using dates as index\n\n#%%\n\nsunspots.index = sunspots['year_month_day']\nsunspots.index.name = 'date'\nsunspots.iloc[10:20, :]\n\n#%%\n\nsunspots.info()\n\n\n# #### Trimming redundant columns\n\n#%%\n\ncols = ['tot_sunspots', 'daily_std', 'observations', 'definite']\nsunspots = sunspots[cols]\nsunspots.iloc[10:20, :]\n\n\n# #### Writing files\n#\n# ```python\n# out_csv = 'sunspots.csv'\n# sunspots.to_csv(out_csv)\n# out_tsv = 'sunspots.tsv'\n# sunspots.to_csv(out_tsv, sep='\\t')\n# out_xlsx = 'sunspots.xlsx'\n# sunspots.to_excel(out_xlsx)\n# ```\n\n# ### Exercises\n\n# #### Reading a flat file\n#\n# In previous exercises, we have preloaded the data for you using the pandas function ```read_csv()```. Now, it's your turn! Your job is to read the World Bank population data you saw earlier into a DataFrame using ```read_csv()```. The file is available in the variable ```data_file```.\n#\n# The next step is to reread the same file, but simultaneously rename the columns using the ```names``` keyword input parameter, set equal to a list of new column labels. 
You will also need to set ```header=0``` to rename the column labels.\n#\n# Finish up by inspecting the result with ```df.head()``` and ```df.info()``` in the IPython Shell (changing ```df``` to the name of your DataFrame variable).\n#\n# ```pandas``` has already been imported and is available in the workspace as ```pd```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** with the string ***data_file*** to read the CSV file into a DataFrame and assign it to ***df1***.\n# * Create a list of new column labels - ***'year'***, ***'population'*** - and assign it to the variable ***new_labels***.\n# * Reread the same file, again using ***pd.read_csv()***, but this time, add the keyword arguments ***header=0*** and ***names=new_labels***. Assign the resulting DataFrame to ***df2***.\n# * Print both the ***df1*** and ***df2*** DataFrames to see the change in column names. This has already been done for you.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/world_population.csv'\n\n#%%\n\n# Read in the file: df1\ndf1 = pd.read_csv(data_file)\n\n#%%\n\n# Create a list of the new column labels: new_labels\nnew_labels = ['year', 'population']\n\n#%%\n\n# Read in the file, specifying the header and names parameters: df2\ndf2 = pd.read_csv(data_file, header=0, names=new_labels)\n\n#%%\n\n# Print both the DataFrames\ndf1.head()\n\n#%%\n\ndf2.head()\n\n\n# #### Delimiters, headers, and extensions\n#\n# Not all data files are clean and tidy. Pandas provides methods for reading those not-so-perfect data files that you encounter far too often.\n#\n# In this exercise, you have monthly stock data for four companies downloaded from [Yahoo Finance](#http://finance.yahoo.com/). The data is stored as one row for each company and each column is the end-of-month closing price. The file name is given to you in the variable ```file_messy```.\n#\n# In addition, this file has three aspects that may cause trouble for lesser tools: multiple header lines, comment records (rows) interleaved throughout the data rows, and space delimiters instead of commas.\n#\n# Your job is to use pandas to read the data from this problematic ```file_messy``` using non-default input options with ```read_csv()``` so as to tidy up the mess at read time. Then, write the cleaned up data to a CSV file with the variable ```file_clean``` that has been prepared for you, as you might do in a real data workflow.\n#\n# You can learn about the option input parameters needed by using ```help()``` on the pandas function ```pd.read_csv()```.\n#\n# ***Instructions***\n#\n# * Use ***pd.read_csv()*** without using any keyword arguments to read ***file_messy*** into a pandas DataFrame ***df1***.\n# * Use ***.head()*** to print the first 5 rows of ***df1*** and see how messy it is. Do this in the IPython Shell first so you can see how modifying ***read_csv()*** can clean up this mess.\n# * Using the keyword arguments ***delimiter=' '***, ***header=3*** and ***comment='#'***, use ***pd.read_csv()*** again to read ***file_messy*** into a new DataFrame ***df2***.\n# * Print the output of ***df2.head(***) to verify the file was read correctly.\n# * Use the DataFrame method ***.to_csv()*** to save the DataFrame ***df2*** to the variable ***file_clean***. Be sure to specify ***index=False***.\n# * Use the DataFrame method ***.to_excel()*** to save the DataFrame ***df2*** to the file ***'file_clean.xlsx'***. 
Again, remember to specify ***index=False***\n\n#%%\n\n# Read the raw file as-is: df1\nfile_messy = 'DataCamp-master/11-pandas-foundations/_datasets/messy_stock_data.tsv'\ndf1 = pd.read_csv(file_messy)\n\n#%%\n\n# Print the output of df1.head()\ndf1.head()\n\n#%%\n\n# Read in the file with the correct parameters: df2\ndf2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')\n\n#%%\n\n# Print the output of df2.head()\ndf2.head()\n\n\n# #### save files\n#\n# ```python\n# # Save the cleaned up DataFrame to a CSV file without the index\n# df2.to_csv(file_clean, index=False)\n# # Save the cleaned up DataFrame to an excel file without the index\n# df2.to_excel('file_clean.xlsx', index=False)\n# ```\n\n# ### Plotting with Pandas\n\n#%%\n\ncols = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']\naapl = pd.read_csv(r'DataCamp-master/11-pandas-foundations/_datasets/AAPL.csv',\n names=cols,\n index_col='date',\n parse_dates=True,\n header=0,\n na_values='null')\n\n#%%\n\naapl.head()\n\n#%%\n\naapl.info()\n\n#%%\n\naapl.tail()\n\n\n# #### Plotting arrays (matplotlib)\n\n#%%\n\nclose_arr = aapl['close'].values\n\n#%%\n\ntype(close_arr)\n\n#%%\n\nplt.plot(close_arr)\n\n\n# #### Plotting Series (matplotlib)\n\n#%%\n\nclose_series = aapl['close']\n\n#%%\n\ntype(close_series)\n\n#%%\n\nplt.plot(close_series)\n\n\n# #### Plotting Series (pandas)\n\n#%%\n\nclose_series.plot()\n\n\n# #### Plotting DataFrames (pandas)\n\n#%%\n\naapl.plot()\n\n\n# #### Plotting DataFrames (matplotlib)\n\n#%%\n\nplt.plot(aapl)\n\n\n# #### Fixing Scales\n\n#%%\n\naapl.plot()\nplt.yscale('log')\nplt.show()\n\n\n# #### Customizing plots\n\n#%%\n\naapl['open'].plot(color='b', style='.-', legend=True)\naapl['close'].plot(color='r', style='.', legend=True)\nplt.axis(('2000', '2001', 0, 10))\nplt.show()\n\n\n# #### Saving Plots\n\n#%%\n\naapl.loc['2001':'2004', ['open', 'close', 'high', 'low']].plot()\n\nplt.savefig('aapl.png')\nplt.savefig('aapl.jpg')\nplt.savefig('aapl.pdf')\n\nplt.show()\n\n\n# ### Exercises\n\n# #### Plotting series using pandas\n#\n# Data visualization is often a very effective first step in gaining a rough understanding of a data set to be analyzed. Pandas provides data visualization by both depending upon and interoperating with the matplotlib library. You will now explore some of the basic plotting mechanics with pandas as well as related matplotlib options. We have pre-loaded a pandas DataFrame ```df``` which contains the data you need. Your job is to use the DataFrame method ```df.plot()``` to visualize the data, and then explore the optional matplotlib input parameters that this ```.plot()``` method accepts.\n#\n# The pandas ```.plot()``` method makes calls to matplotlib to construct the plots. This means that you can use the skills you've learned in previous visualization courses to customize the plot. In this exercise, you'll add a custom title and axis labels to the figure.\n#\n# Before plotting, inspect the DataFrame in the IPython Shell using ```df.head()```. Also, use ```type(df)``` and note that it is a single column DataFrame.\n#\n# ***Instructions***\n#\n# * Create the plot with the DataFrame method ***df.plot()***. 
Specify a ***color*** of ***'red'***.\n# * Note: ***c*** and ***color*** are interchangeable as parameters here, but we ask you to be explicit and specify ***color***.\n# * Use ***plt.title()*** to give the plot a title of ***'Temperature in Austin'***.\n# * Use ***plt.xlabel()*** to give the plot an x-axis label of ***'Hours since midnight August 1, 2010'***.\n# * Use ***plt.ylabel()*** to give the plot a y-axis label of ***'Temperature (degrees F)'***.\n# * Finally, display the plot using ***plt.show()***\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, usecols=['Temperature'])\n\n#%%\n\ndf.info()\n\n#%%\n\ndf.head()\n\n#%%\n\n# Create a plot with color='red'\ndf.plot(color='r')\n\n# Add a title\nplt.title('Temperature in Austin')\n\n# Specify the x-axis label\nplt.xlabel('Hours since midnight August 1, 2010')\n\n# Specify the y-axis label\nplt.ylabel('Temperature (degrees F)')\n\n# Display the plot\nplt.show()\n\n\n# #### Plotting DataFrames\n#\n# Comparing data from several columns can be very illuminating. Pandas makes doing so easy with multi-column DataFrames. By default, calling ```df.plot()``` will cause pandas to over-plot all column data, with each column as a single line. In this exercise, we have pre-loaded three columns of data from a weather data set - temperature, dew point, and pressure - but the problem is that pressure has different units of measure. The pressure data, measured in Atmospheres, has a different vertical scaling than that of the other two data columns, which are both measured in degrees Fahrenheit.\n#\n# Your job is to plot all columns as a multi-line plot, to see the nature of vertical scaling problem. Then, use a list of column names passed into the DataFrame ```df[column_list]``` to limit plotting to just one column, and then just 2 columns of data. When you are finished, you will have created 4 plots. You can cycle through them by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# As in the previous exercise, inspect the DataFrame ```df``` in the IPython Shell using the ```.head()``` and ```.info()``` methods.\n#\n# ***Instructions***\n#\n# * Plot all columns together on one figure by calling ***df.plot()***, and noting the vertical scaling problem.\n# * Plot all columns as subplots. To do so, you need to specify ***subplots=True*** inside ***.plot()***.\n# * Plot a single column of dew point data. To do this, define a column list containing a single column name ***'Dew Point (deg F)'***, and call ***df[column_list1].plot()***.\n# * Plot two columns of data, ***'Temperature (deg F)'*** and ***'Dew Point (deg F)'***. To do this, define a list containing those column names and pass it into ***df[]***, as ***df[column_list2].plot()***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf = pd.read_csv(data_file, parse_dates=[3], index_col='Date')\ndf.head()\n\n#%%\n\n# Plot all columns (default)\ndf.plot()\nplt.show()\n\n#%%\n\n# Plot all columns as subplots\ndf.plot(subplots=True)\nplt.show()\n\n#%%\n\n# Plot just the Dew Point data\ncolumn_list1 = ['DewPoint']\ndf[column_list1].plot()\nplt.show()\n\n#%%\n\n# Plot the Dew Point and Temperature data, but not the Pressure data\ncolumn_list2 = ['Temperature', 'DewPoint']\ndf[column_list2].plot()\nplt.show()\n\n\n# ## Exploratory Data Analysis\n#\n# Having learned how to ingest and inspect your data, you will next explore it visually as well as quantitatively. 
This process, known as exploratory data analysis (EDA), is a crucial component of any data science project. Pandas has powerful methods that help with statistical and visual EDA. In this chapter, you will learn how and when to apply these techniques.\n\n# ### Visual exploratory data analysis\n\n# #### The Iris Dataset\n#\n# * Famous dataset in pattern recognition\n# * 150 observations, 4 features each\n# * Sepal length\n# * Sepal width\n# * Petal length\n# * Petal width\n# * 3 species:\n# * setosa\n# * versicolor\n# * virginica\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/iris.csv'\niris = pd.read_csv(data_file)\n\n#%%\n\niris.shape\n\n#%%\n\niris.head()\n\n\n# #### Line plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)')\n\n\n# #### Scatter Plot\n\n#%%\n\niris.plot(x='sepal length (cm)', y='sepal width (cm)',\n kind='scatter')\nplt.xlabel('sepal length (cm)')\nplt.ylabel('sepal width (cm)')\n\n\n# #### Box Plot\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='box')\nplt.ylabel('sepal length (cm)')\n\n\n# #### Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist')\nplt.xlabel('sepal length (cm)')\n\n\n# #### Histogram Options\n#\n# * **bins** (integer): number of intervals or bins\n# * **range** (tuple): extrema of bins (minimum, maximum)\n# * **density** (boolean): whether to normalized to one - formerly this was **normed**\n# * **cumulative** (boolean): computer Cumulative Distributions Function (CDF)\n# * ... more matplotlib customizations\n\n# #### Customizing Histogram\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True)\nplt.xlabel('sepal length (cm)')\n\n\n# #### Cumulative Distribution\n\n#%%\n\niris.plot(y='sepal length (cm)',\n kind='hist',\n bins=30,\n range=(4, 8),\n density=True,\n cumulative=True)\nplt.xlabel('sepal length (cm)')\nplt.title('Cumulative Distribution Function (CDF)')\n\n\n# #### Word of Warning\n#\n# * Three different DataFrame plot idioms\n# * iris.plot(kind='hist')\n# * iris.plt.hist()\n# * iris.hist()\n# * Syntax / Results differ!\n# * Pandas API still evolving: chech the documentation\n\n# ### Exercises\n\n# #### pandas line plots\n#\n# In the previous chapter, you saw that the ```.plot()``` method will place the Index values on the x-axis by default. In this exercise, you'll practice making line plots with specific columns on the x and y axes.\n#\n# You will work with a dataset consisting of monthly stock prices in 2015 for AAPL, GOOG, and IBM. The stock prices were obtained from [Yahoo Finance](#http://finance.yahoo.com/```). Your job is to plot the 'Month' column on the x-axis and the AAPL and IBM prices on the y-axis using a list of column names.\n#\n# All necessary modules have been imported for you, and the DataFrame is available in the workspace as df. 
Explore it using methods such as ```.head()```, ```.info()```, and ```.describe()``` to see the column names.\n#\n# ***Instructions***\n#\n# * Create a list of y-axis column names called ***y_columns*** consisting of ***'AAPL'*** and ***'IBM'***.\n# * Generate a line plot with ***x='Month'*** and ***y=y_columns*** as inputs.\n# * Give the plot a title of ***'Monthly stock prices'***.\n# * Specify the y-axis label.\n# * Display the plot.\n\n#%%\n\nvalues = [['Jan', 117.160004, 534.5224450000002, 153.309998],\n ['Feb', 128.46000700000002, 558.402511, 161.940002],\n ['Mar', 124.43, 548.002468, 160.5],\n ['Apr', 125.150002, 537.340027, 171.28999299999995],\n ['May', 130.279999, 532.1099849999998, 169.649994],\n ['Jun', 125.43, 520.51001, 162.660004],\n ['Jul', 121.300003, 625.6099849999998, 161.990005],\n ['Aug', 112.760002, 618.25, 147.889999],\n ['Sep', 110.300003, 608.419983, 144.970001],\n ['Oct', 119.5, 710.8099980000002, 140.080002],\n ['Nov', 118.300003, 742.599976, 139.419998],\n ['Dec', 105.260002, 758.880005, 137.619995]]\n\nvalues = np.array(values).transpose()\n\n#%%\n\ncols = ['Month', 'AAPL', 'GOOG', 'IBM']\n\n#%%\n\ndata_zipped = list(zip(cols, values))\n\n#%%\n\ndata_dict = dict(data_zipped)\n\n#%%\n\ndf = pd.DataFrame.from_dict(data_dict, dtype='float')\n\n#%%\n\ndf\n\n#%%\n\ndf.info()\n\n#%%\n\n# Create a list of y-axis column names: y_columns\ny_columns = ['AAPL', 'IBM']\n\n# Generate a line plot\ndf.plot(x='Month', y=y_columns)\n\n# Add the title\nplt.title('Monthly stock prices')\n\n# Add the y-axis label\nplt.ylabel('Price ($US)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas scatter plots\n#\n# Pandas scatter plots are generated using the ```kind='scatter'``` keyword argument. Scatter plots require that the x and y columns be chosen by specifying the ```x``` and ```y``` parameters inside ```.plot()```. Scatter plots also take an ```s``` keyword argument to provide the radius of each circle to plot in pixels.\n#\n# In this exercise, you're going to plot fuel efficiency (miles-per-gallon) versus horse-power for 392 automobiles manufactured from 1970 to 1982 from the [UCI Machine Learning Repository](#https://archive.ics.uci.edu/ml/datasets/Auto+MPG).\n#\n# The size of each circle is provided as a NumPy array called ```sizes```. This array contains the normalized ```'weight'``` of each automobile in the dataset.\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as df.\n#\n# ***Instructions***\n#\n# * Generate a scatter plot with ***'hp'*** on the x-axis and ***'mpg'*** on the y-axis. 
Specify ***s=sizes***.\n# * Add a title to the plot.\n# * Specify the x-axis and y-axis labels.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.info()\n\n#%%\n\nsizes = np.array([51.12044694, 56.78387977, 49.15557238, 49.06977358,\n 49.52823321, 78.4595872, 78.93021696, 77.41479205,\n 81.52541106, 61.71459825, 52.85646225, 54.23007578,\n 58.89427963, 39.65137852, 23.42587473, 33.41639502,\n 32.03903011, 27.8650165, 18.88972581, 14.0196956,\n 29.72619722, 24.58549713, 23.48516821, 20.77938954,\n 29.19459189, 88.67676838, 79.72987328, 79.94866084,\n 93.23005042, 18.88972581, 21.34122243, 20.6679223,\n 28.88670381, 49.24144612, 46.14174741, 45.39631334,\n 45.01218186, 73.76057586, 82.96880195, 71.84547684,\n 69.85320595, 102.22421043, 93.78252358, 110.,\n 36.52889673, 24.14234281, 44.84805372, 41.02504618,\n 20.51976563, 18.765772, 17.9095202, 17.75442285,\n 13.08832041, 10.83266174, 14.00441945, 15.91328975,\n 21.60597587, 18.8188451, 21.15311208, 24.14234281,\n 20.63083317, 76.05635059, 80.05816704, 71.18975117,\n 70.98330444, 56.13992036, 89.36985382, 84.38736544,\n 82.6716892, 81.4149056, 22.60363518, 63.06844313,\n 69.92143863, 76.76982089, 69.2066568, 35.81711267,\n 26.25184749, 36.94940537, 19.95069229, 23.88237331,\n 21.79608472, 26.1474042, 19.49759118, 18.36136808,\n 69.98970461, 56.13992036, 66.21810474, 68.02351436,\n 59.39644014, 102.10046481, 82.96880195, 79.25686195,\n 74.74521151, 93.34830013, 102.05923292, 60.7883734,\n 40.55589449, 44.7388015, 36.11079464, 37.9986264,\n 35.11233175, 15.83199594, 103.96451839, 100.21241654,\n 90.18186347, 84.27493641, 32.38645967, 21.62494928,\n 24.00218436, 23.56434276, 18.78345471, 22.21725537,\n 25.44271071, 21.36007926, 69.37650986, 76.19877818,\n 14.51292942, 19.38962134, 27.75740889, 34.24717407,\n 48.10262495, 29.459795, 32.80584831, 55.89556844,\n 40.06360581, 35.03982309, 46.33599903, 15.83199594,\n 25.01226779, 14.03498009, 26.90404245, 59.52231336,\n 54.92349014, 54.35035315, 71.39649768, 91.93424995,\n 82.70879915, 89.56285636, 75.45251972, 20.50128352,\n 16.04379287, 22.02531454, 11.32159874, 16.70430249,\n 18.80114574, 18.50153068, 21.00322336, 25.79385418,\n 23.80266582, 16.65430211, 44.35746794, 49.815853,\n 49.04119063, 41.52318884, 90.72524338, 82.07906251,\n 84.23747672, 90.29816462, 63.55551901, 63.23059357,\n 57.92740995, 59.64831981, 38.45278922, 43.19643409,\n 41.81296121, 19.62393488, 28.99647648, 35.35456858,\n 27.97283229, 30.39744886, 20.57526193, 26.96758278,\n 37.07354237, 15.62160631, 42.92863291, 30.21771564,\n 36.40567571, 36.11079464, 29.70395123, 13.41514444,\n 25.27829944, 20.51976563, 27.54281821, 21.17188565,\n 20.18836167, 73.97101962, 73.09614831, 65.35749368,\n 73.97101962, 43.51889468, 46.80945169, 37.77255674,\n 39.6256851, 17.24230306, 19.49759118, 15.62160631,\n 13.41514444, 55.49963323, 53.18333207, 55.31736854,\n 42.44868923, 13.86730874, 16.48817545, 19.33574884,\n 27.3931002, 41.31307817, 64.63368105, 44.52069676,\n 35.74387954, 60.75655952, 79.87569835, 68.46177648,\n 62.35745431, 58.70651902, 17.41217694, 19.33574884,\n 13.86730874, 22.02531454, 15.75091031, 62.68013142,\n 68.63071356, 71.36201911, 76.80558184, 51.58836621,\n 48.84134317, 54.86301837, 51.73502816, 74.14661842,\n 72.22648148, 77.88228247, 78.24284811, 15.67003285,\n 31.25845963, 21.36007926, 31.60164234, 17.51450098,\n 17.92679488, 16.40542438, 19.96892459, 32.99310928,\n 28.14577056, 30.80379718, 16.40542438, 13.48998471,\n 16.40542438, 
17.84050478, 13.48998471, 47.1451025,\n 58.08281541, 53.06435374, 52.02897659, 41.44433489,\n 36.60292926, 30.80379718, 48.98404972, 42.90189859,\n 47.56635225, 39.24128299, 54.56115914, 48.41447259,\n 48.84134317, 49.41341845, 42.76835191, 69.30854366,\n 19.33574884, 27.28640858, 22.02531454, 20.70504474,\n 26.33555201, 31.37264569, 33.93740821, 24.08222494,\n 33.34566004, 41.05118927, 32.52595611, 48.41447259,\n 16.48817545, 18.97851406, 43.84255439, 37.22278157,\n 34.77459916, 44.38465193, 47.00510227, 61.39441929,\n 57.77221268, 65.12675249, 61.07507305, 79.14790534,\n 68.42801405, 54.10993164, 64.63368105, 15.42864956,\n 16.24054679, 15.26876826, 29.68171358, 51.88189829,\n 63.32798377, 42.36896092, 48.6988448, 20.15170555,\n 19.24612787, 16.98905358, 18.88972581, 29.68171358,\n 28.03762169, 30.35246559, 27.20120517, 19.13885751,\n 16.12562794, 18.71277385, 16.9722369, 29.85984799,\n 34.29495526, 37.54716158, 47.59450219, 19.93246832,\n 30.60028577, 26.90404245, 24.66650366, 21.36007926,\n 18.5366546, 32.64243213, 18.5366546, 18.09999962,\n 22.70075058, 36.23351603, 43.97776651, 14.24983724,\n 19.15671509, 14.17291518, 35.25757392, 24.38356372,\n 26.02234705, 21.83420642, 25.81458463, 28.90864169,\n 28.58044785, 30.91715052, 23.6833544, 12.82391671,\n 14.63757021, 12.89709155, 17.75442285, 16.24054679,\n 17.49742615, 16.40542438, 20.42743834, 17.41217694,\n 23.58415722, 19.96892459, 20.33531923, 22.99334585,\n 28.47146626, 28.90864169, 43.43816712, 41.57579979,\n 35.01567018, 35.74387954, 48.5565546, 57.77221268,\n 38.98605581, 49.98882458, 28.25412762, 29.01845599,\n 23.88237331, 27.60710798, 26.54539622, 31.14448175,\n 34.17556473, 16.3228815, 17.0732619, 16.15842026,\n 18.80114574, 18.80114574, 19.42557798, 20.2434083,\n 20.98452475, 16.07650192, 16.07650192, 16.57113469,\n 36.11079464, 37.84783835, 27.82194848, 33.46359332,\n 29.5706502, 23.38638738, 36.23351603, 32.40968826,\n 18.88972581, 21.92965639, 28.68963762, 30.80379718])\n\n#%%\n\n# Generate a scatter plot\ndf.plot(kind='scatter', x='hp', y='mpg', s=sizes)\n\n# Add the title\nplt.title('Fuel efficiency vs Horse-power')\n\n# Add the x-axis label\nplt.xlabel('Horse-power')\n\n# Add the y-axis label\nplt.ylabel('Fuel efficiency (mpg)')\n\n# Display the plot\nplt.show()\n\n\n# #### pandas box plots\n#\n# While pandas can plot multiple columns of data in a single figure, making plots that share the same x and y axes, there are cases where two columns cannot be plotted together because their units do not match. The ```.plot()``` method can generate subplots for each column being plotted. Here, each plot will be scaled independently.\n#\n# In this exercise your job is to generate box plots for ***fuel efficiency (mpg)*** and ***weight*** from the automobiles data set. To do this in a single figure, you'll specify ```subplots=True``` inside ```.plot()``` to generate two separate plots.\n#\n# All necessary modules have been imported and the automobiles dataset is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Make a list called ***cols*** of the column names to be plotted: ***'weight'*** and ***'mpg'***.\n# * Call plot on ***df[cols]*** to generate a box plot of the two columns in a single figure. 
To do this, specify ***subplots=True***.\n\n#%%\n\n# Make a list of the column names to be plotted: cols\ncols = ['weight', 'mpg']\n\n# Generate the box plots\ndf[cols].plot(kind='box', subplots=True)\n\n# Display the plot\nplt.show()\n\n\n# #### pandas hist, pdf and cdf\n#\n# Pandas relies on the ```.hist()``` method not only to generate histograms, but also to plot probability density functions (PDFs) and cumulative distribution functions (CDFs).\n#\n# In this exercise, you will work with a dataset consisting of restaurant bills that includes the amount customers tipped.\n#\n# The original dataset is provided by the [Seaborn package](#https://github.com/mwaskom/seaborn-data/blob/master/tips.csv).\n#\n# Your job is to plot a PDF and CDF for the fraction column of the tips dataset. This column contains information about what ```fraction``` of the total bill the tip makes up.\n#\n# Remember, when plotting the PDF, you need to specify ```normed=True``` in your call to ```.hist()```, and when plotting the CDF, you need to specify ```cumulative=True``` in addition to ```normed=True```. (In current versions of pandas and matplotlib, ```normed``` has been replaced by ```density```, which is what the solution cells below use.)\n#\n# All necessary modules have been imported and the tips dataset is available in the workspace as ```df```. Also, some formatting code has been written so that the plots you generate will appear on separate rows.\n#\n# ***Instructions***\n#\n# * Plot a PDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. The range has been taken care of for you. ***ax=axes[0]*** means that this plot will appear in the first row.\n# * Plot a CDF for the values in ***fraction*** with 30 ***bins*** between 0 and 30%. Again, the range has been specified for you. To make the CDF appear on the second row, you need to specify ***ax=axes[1]***.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/tips.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# This formats the plots such that they appear on separate rows\nfig, axes = plt.subplots(nrows=2, ncols=1)\n\n# Plot the PDF\ndf.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))\n\n# Plot the CDF\ndf.fraction.plot(ax=axes[1], kind='hist', bins=30,\n                 density=True, cumulative=True, range=(0, .3))\n\n\n# ### Statistical Exploratory Data Analysis\n\n# #### Summarizing with describe()\n#\n# ***Describe***\n# * count: number of entries\n# * mean: average of entries\n# * std: standard deviation\n# * min: minimum entry\n# * 25%: first quartile\n# * 50%: median or second quartile\n# * 75%: third quartile\n# * max: maximum entry\n\n#%%\n\niris.describe() # summary statistics\n\n\n# #### Counts\n\n#%%\n\niris['sepal length (cm)'].count() # Applied to Series\n\n#%%\n\niris['sepal width (cm)'].count() # Applied to Series\n\n#%%\n\niris[['petal length (cm)', 'petal width (cm)']].count() # Applied to DataFrame\n\n#%%\n\ntype(iris[['petal length (cm)', 'petal width (cm)']].count()) # Returns a Series\n\n\n# #### Averages\n#\n# * measures the tendency of a measurement toward a central value\n\n#%%\n\niris['sepal length (cm)'].mean() # Applied to Series\n\n#%%\n\niris.mean() # Applied to entire DataFrame\n\n\n# #### Standard Deviations (std)\n#\n# * measures the spread of a measurement\n\n#%%\n\niris.std()\n\n\n# #### Mean and Standard Deviation on a Bell Curve\n\n#%%\n\niris['sepal width (cm)'].plot(kind='hist', bins=30)\n\n\n# #### Medians\n#\n# * middle number of the measurements\n# * special example of a quantile\n\n#%%\n\niris.median()\n\n\n# #### Quantile\n#\n# * If q is between 0 and 1, the qth quantile of a dataset is a numerical 
value that splits the data into two sets\n# * one with the fraction q of smaller observations\n# * one with the fraction q of larger observations\n# * Quantiles are percentages\n# * Median is the 0.5 quantile or the 50th percentile of a dataset\n\n#%%\n\nq = 0.5\niris.quantile(q)\n\n\n# #### Inter-quartile range (IQR)\n\n#%%\n\nq = [0.25, 0.75]\niris.quantile(q)\n\n\n# #### Range\n#\n# * interval between the smallest and largest observations\n# * given by the min and max methods\n\n#%%\n\niris.min()\n\n#%%\n\niris.max()\n\n\n# #### Box Plots\n\n#%%\n\niris.plot(kind='box')\nplt.ylabel('[cm]')\n\n\n# ### Exercises\n\n# #### Fuel efficiency\n#\n# From the automobiles data set, which value corresponds to the median value of the ```'mpg'``` column? Your job is to select the ```'mpg'``` column and call the ```.median()``` method on it. The automobile DataFrame has been provided as ```df```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\ndf.median()\n\n\n# #### Bachelor's degrees awarded to women\n# In this exercise, you will investigate statistics of the percentage of Bachelor's degrees awarded to women from 1970 to 2011. Data is recorded every year for 17 different fields. This data set was obtained from the [Digest of Education Statistics](#http://nces.ed.gov/programs/digest/2013menu_tables.asp).\n#\n# Your job is to compute the minimum and maximum values of the ```'Engineering'``` column and generate a line plot of the mean value of all 17 academic fields per year. To perform this step, you'll use the ```.mean()``` method with the keyword argument ```axis='columns'```. This computes the mean across all columns per row.\n#\n# The DataFrame has been pre-loaded for you as ```df``` with the index set to ```'Year'```.\n#\n# ***Instructions***\n#\n# * Print the minimum value of the ***'Engineering'*** column.\n# * Print the maximum value of the ***'Engineering'*** column.\n# * Construct the mean percentage per year with ***.mean(axis='columns')***. Assign the result to ***mean***.\n# * Plot the average percentage per year. Since ***'Year'*** is the index of ***df***, it will appear on the x-axis of the plot. No keyword arguments are needed in your call to ***.plot()***.\n#\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/percent-bachelors-degrees-women-usa.csv'\ndf = pd.read_csv(data_file, index_col='Year')\ndf.head()\n\n#%%\n\n# Print the minimum value of the Engineering column\ndf.Engineering.min()\n\n#%%\n\n# Print the maximum value of the Engineering column\ndf.Engineering.max()\n\n#%%\n\n# Construct the mean percentage per year: mean\nmean = df.mean(axis='columns')\nmean.head()\n\n#%%\n\n# Plot the average percentage per year\nmean.plot()\n\n\n# #### Median vs mean\n#\n# In many data sets, there can be large differences in the mean and median value due to the presence of outliers.\n#\n# In this exercise, you'll investigate the mean, median, and max fare prices paid by passengers on the Titanic and generate a box plot of the fare prices. This data set was obtained from [Vanderbilt University](#http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html).\n#\n# All necessary modules have been imported and the DataFrame is available in the workspace as ```df```.\n#\n# ***Instructions***\n#\n# * Print summary statistics of the ***'fare'*** column of ***df*** with ***.describe()*** and ***print()***. 
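\n#\n# As a quick aside (not part of the original exercise), a minimal sketch with made-up fare values shows why the two statistics can disagree: a single extreme value drags the mean upward while the median barely moves.\n#\n# ```python\n# import pandas as pd\n#\n# # Hypothetical fares; the last value is an extreme outlier\n# fares = pd.Series([7.25, 8.05, 13.00, 26.55, 512.33])\n#\n# print(fares.mean())    # 113.436 -- pulled up by the outlier\n# print(fares.median())  # 13.0    -- robust to the outlier\n# ```\n#\n# * 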
Note: ***df.fare*** and ***df['fare']*** are equivalent.\n# * Generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf.fare.describe()\n\n#%%\n\ndf.fare.plot(kind='box')\n\n\n# #### Quantiles\n#\n# In this exercise, you'll investigate the probabilities of life expectancy in countries around the world. This dataset contains life expectancy for persons born each year from 1800 to 2015. Since country names change or results are not reported, not every country has values. This dataset was obtained from [Gapminder](#https://docs.google.com/a/continuum.io/spreadsheets/d/1dgOdlUEq6_V55OHZCxz5BG_0uoghJTeA6f83br5peNs/pub?range=A1:D70&gid=1&output=html#).\n#\n# First, you will determine the number of countries reported in 2015. There are a total of 260 unique countries in the entire dataset. Then, you will compute the 5th and 95th percentiles of life expectancy over the entire dataset. Finally, you will make a box plot of life expectancy every 50 years from 1800 to 2000. Notice the large change in the distributions over this period.\n#\n# The dataset has been pre-loaded into a DataFrame called ```df```.\n#\n# ***Instructions***\n#\n# * Print the number of countries reported in 2015. To do this, use the ***.count()*** method on the ***'2015'*** column of ***df***.\n# * Print the 5th and 95th percentiles of ***df***. To do this, use the ***.quantile()*** method with the list ***[0.05, 0.95]***.\n# * Generate a box plot using the list of columns provided in ***years***. This has already been done for you, so click on 'Submit Answer' to view the result!\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/life_expectancy_at_birth.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\n# Print the number of countries reported in 2015\ndf['2015'].count()\n\n#%%\n\n# Print the 5th and 95th percentiles\ndf.quantile([0.05, 0.95])\n\n#%%\n\n# Generate a box plot\nyears = ['1800', '1850', '1900', '1950', '2000']\ndf[years].plot(kind='box')\n\n\n# #### Standard deviation of temperature\n#\n# Let's use the mean and standard deviation to explore differences in temperature distributions in Pittsburgh in 2013. The data has been obtained from [Weather Underground](#https://www.wunderground.com/history/).\n#\n# In this exercise, you're going to compare the distribution of daily temperatures in January and March. You'll compute the mean and standard deviation for these two months. 
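\n#\n# As an aside (not part of the original exercise), a tiny sketch with made-up temperatures illustrates the point made next: two samples can share the same mean while having very different standard deviations.\n#\n# ```python\n# import pandas as pd\n#\n# # Hypothetical daily temperatures: identical means, different spread\n# steady = pd.Series([30, 31, 32, 33, 34])\n# swingy = pd.Series([12, 22, 32, 42, 52])\n#\n# print(steady.mean(), swingy.mean())  # 32.0 32.0\n# print(steady.std(), swingy.std())    # ~1.58 vs ~15.81\n# ```\n#\n# 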
You will notice that while the mean values are similar, the standard deviations are quite different, meaning that one month had a larger fluctuation in temperature than the other.\n#\n# The DataFrames have been pre-loaded for you as ```january```, which contains the January data, and ```march```, which contains the March data.\n#\n# ***Instructions***\n#\n# * Compute and print the means of the January and March data using the ***.mean()*** method.\n# * Compute and print the standard deviations of the January and March data using the ***.std()*** method.\n\n#%%\n\njan_values = np.array([['2013-01-01', 28],\n ['2013-01-02', 21],\n ['2013-01-03', 24],\n ['2013-01-04', 28],\n ['2013-01-05', 30],\n ['2013-01-06', 34],\n ['2013-01-07', 29],\n ['2013-01-08', 31],\n ['2013-01-09', 36],\n ['2013-01-10', 34],\n ['2013-01-11', 47],\n ['2013-01-12', 55],\n ['2013-01-13', 62],\n ['2013-01-14', 44],\n ['2013-01-15', 30],\n ['2013-01-16', 32],\n ['2013-01-17', 32],\n ['2013-01-18', 24],\n ['2013-01-19', 42],\n ['2013-01-20', 35],\n ['2013-01-21', 18],\n ['2013-01-22', 9],\n ['2013-01-23', 11],\n ['2013-01-24', 16],\n ['2013-01-25', 16],\n ['2013-01-26', 23],\n ['2013-01-27', 23],\n ['2013-01-28', 40],\n ['2013-01-29', 59],\n ['2013-01-30', 58],\n ['2013-01-31', 32]]).transpose()\ncols = ['Date', 'Temperature']\njan_zip = list(zip(cols, jan_values))\njan_dict = dict(jan_zip)\njanuary = pd.DataFrame.from_dict(jan_dict).astype({'Temperature': np.int64})\njanuary.head()\n\n#%%\n\nmar_values = np.array([['2013-03-01', 28],\n ['2013-03-02', 26],\n ['2013-03-03', 24],\n ['2013-03-04', 28],\n ['2013-03-05', 32],\n ['2013-03-06', 34],\n ['2013-03-07', 36],\n ['2013-03-08', 32],\n ['2013-03-09', 40],\n ['2013-03-10', 55],\n ['2013-03-11', 55],\n ['2013-03-12', 40],\n ['2013-03-13', 32],\n ['2013-03-14', 30],\n ['2013-03-15', 38],\n ['2013-03-16', 36],\n ['2013-03-17', 32],\n ['2013-03-18', 34],\n ['2013-03-19', 36],\n ['2013-03-20', 32],\n ['2013-03-21', 22],\n ['2013-03-22', 28],\n ['2013-03-23', 34],\n ['2013-03-24', 34],\n ['2013-03-25', 32],\n ['2013-03-26', 34],\n ['2013-03-27', 34],\n ['2013-03-28', 37],\n ['2013-03-29', 43],\n ['2013-03-30', 43],\n ['2013-03-31', 44]]).transpose()\nmar_zip = list(zip(cols, mar_values))\nmar_dict = dict(mar_zip)\nmarch = pd.DataFrame.from_dict(mar_dict).astype({'Temperature': np.int64})\nmarch.head()\n\n#%%\n\n# Print the mean of the January and March data\njanuary.mean()\n\n#%%\n\nmarch.mean()\n\n#%%\n\n# Print the standard deviation of the January and March data\njanuary.std()\n\n#%%\n\nmarch.std()\n\n\n# ### Separating Populations with Boolean Indexing\n\n# #### Describe species column\n#\n# * contains categorical data\n# * count: number of non-null entries\n# * unique: number of distinct values\n# * top: most frequent category\n# * freq: number of occurrences of the top value\n\n#%%\n\niris.species.describe()\n\n\n# #### Unique and Factors\n\n#%%\n\niris.species.unique()\n\n\n# #### Filtering by species\n\n#%%\n\nindices = iris['species'] == 'setosa'\nsetosa = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'versicolor'\nversicolor = iris.loc[indices, :] # extract new DataFrame\n\nindices = iris['species'] == 'virginica'\nvirginica = iris.loc[indices, :] # extract new DataFrame\n\n\n# #### Checking species\n\n#%%\n\nsetosa['species'].unique()\n\n#%%\n\nversicolor['species'].unique()\n\n#%%\n\nvirginica['species'].unique()\n\n#%%\n\nsetosa.head(2)\n\n#%%\n\nversicolor.head(2)\n\n#%%\n\nvirginica.head(2)\n\n\n# #### Visual EDA: All 
Data\n\n#%%\n\niris.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Entire Iris Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Visual EDA: Individual Factors\n\n#%%\n\nsetosa.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Setosa Dataset')\nplt.xlabel('[cm]')\n\nversicolor.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Versicolor Dataset')\nplt.xlabel('[cm]')\n\nvirginica.plot(kind='hist',\n bins=50,\n range=(0, 8),\n alpha=0.3)\nplt.title('Virginica Dataset')\nplt.xlabel('[cm]')\n\n\n# #### Statistical EDA: describe()\n\n#%%\n\ndescribe_all = iris.describe()\ndescribe_all\n\n#%%\n\ndescribe_setosa = setosa.describe()\ndescribe_setosa\n\n#%%\n\ndescribe_versicolor = versicolor.describe()\ndescribe_versicolor\n\n#%%\n\ndescribe_virginica = virginica.describe()\ndescribe_virginica\n\n\n# #### Computing Errors\n#\n# * This is the absolute difference of the correct statistics computed in its own group from the statistic computed with the whole population divided by the correct statistics\n# * Elementwise arithmetic so no need for loops\n\n#%%\n\nerror_setosa = 100 * np.abs(describe_setosa - describe_all)\nerror_setosa = error_setosa / describe_setosa\nerror_setosa\n\n#%%\n\nerror_versicolor = 100 * np.abs(describe_versicolor - describe_all)\nerror_versicolor = error_versicolor / describe_versicolor\nerror_versicolor\n\n#%%\n\nerror_virginica = 100 * np.abs(describe_virginica - describe_all)\nerror_virginica = error_virginica / describe_virginica\nerror_virginica\n\n\n# ### Exercises\n\n# #### Filtering and counting\n#\n# How many automobiles were manufactured in Asia in the automobile dataset? The DataFrame has been provided for you as ```df```. Use filtering and the ```.count()``` member method to determine the number of rows where the ```'origin'``` column has the value ```'Asia'```.\n#\n# As an example, you can extract the rows that contain ```'US'``` as the country of origin using ```df[df['origin'] == 'US']```.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/auto-mpg.csv'\ndf = pd.read_csv(data_file)\ndf.head(3)\n\n#%%\n\ndf[df['origin'] == 'Asia'].origin.count()\n\n\n# #### Separate and summarize\n#\n# Let's use population filtering to determine how the automobiles in the US differ from the global average and standard deviation. How does the distribution of fuel efficiency (MPG) for the US differ from the global average and standard deviation?\n#\n# In this exercise, you'll compute the means and standard deviations of all columns in the full automobile dataset. Next, you'll compute the same quantities for just the US population and subtract the global values from the US values.\n#\n# All necessary modules have been imported and the DataFrame has been pre-loaded as ```df```.\n#\n# ***Instructions***\n#\n# * Compute the global mean and global standard deviations of ***df*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***global_mean*** and ***global_std***.\n# * Filter the ***'US'*** population from the ***'origin'*** column and assign the result to ***us***.\n# * Compute the US mean and US standard deviations of ***us*** using the ***.mean()*** and ***.std()*** methods. Assign the results to ***us_mean*** and ***us_std***.\n# * Print the differences between ***us_mean*** and ***global_mean*** and ***us_std*** and ***global_std***. 
This has already been done for you.\n\n#%%\n\n# Compute the global mean and global standard deviation: global_mean, global_std\nglobal_mean = df.mean()\nglobal_std = df.std()\n\n#%%\n\n# Filter the US population from the origin column: us\nus = df[df['origin'] == 'US']\n\n#%%\n\n# Compute the US mean and US standard deviation: us_mean, us_std\nus_mean = us.mean()\nus_std = us.std()\n\n#%%\n\n# Print the differences\nprint(us_mean - global_mean)\nprint(us_std - global_std)\n\n\n# #### Separate and plot\n#\n# Population filtering can be used alongside plotting to quickly determine differences in distributions between the sub-populations. You'll work with the Titanic dataset.\n#\n# There were three passenger classes on the Titanic, and passengers in each class paid a different fare price. In this exercise, you'll investigate the differences in these fare prices.\n#\n# Your job is to use Boolean filtering and generate box plots of the fare prices for each of the three passenger classes. The fare prices are contained in the ```'fare'``` column and passenger class information is contained in the ```'pclass'``` column.\n#\n# When you're done, notice the portions of the box plots that differ and those that are similar.\n#\n# The DataFrame has been pre-loaded for you as ```titanic```.\n#\n# ***Instructions***\n#\n# * Inside ***plt.subplots()***, specify the ***nrows*** and ***ncols*** parameters so that there are 3 rows and 1 column.\n# * Filter the rows where the ***'pclass'*** column has the values ***1*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***2*** and generate a box plot of the ***'fare'*** column.\n# * Filter the rows where the ***'pclass'*** column has the values ***3*** and generate a box plot of the ***'fare'*** column.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/titanic.csv'\ntitanic = pd.read_csv(data_file)\ntitanic.head(3)\n\n#%%\n\n# Display the box plots on 3 separate rows and 1 column\nfig, axes = plt.subplots(nrows=3, ncols=1)\n\n# Generate a box plot of the fare prices for the First passenger class\ntitanic.loc[titanic['pclass'] == 1].plot(ax=axes[0], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Second passenger class\ntitanic.loc[titanic['pclass'] == 2].plot(ax=axes[1], y='fare', kind='box')\n\n# Generate a box plot of the fare prices for the Third passenger class\ntitanic.loc[titanic['pclass'] == 3].plot(ax=axes[2], y='fare', kind='box')\n\nplt.tight_layout()\n\n\n# ## Time Series in pandas\n#\n# In this chapter, you will learn how to manipulate and visualize time series data using Pandas. You will become familiar with concepts such as upsampling, downsampling, and interpolation. You will practice using Pandas' method chaining to efficiently filter your data and perform time series analyses. 
From stock prices to flight timings, time series data are found in a wide variety of domains and being able to effectively work with such data can be an invaluable skill.\n\n# ### Indexing pandas time series\n\n# #### Using pandas to read datetime objects\n#\n# * read_csv() function\n# * Can read strings into datetime objects\n# * Need to specify ***parse_dates=True***\n# * ISO 8601 format\n# * ***yyyy-mm-dd hh:mm:ss***\n\n# #### Product Sales CSV - Parse dates\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n#%%\n\nsales.info()\n\n\n# #### Selecting single datetime\n\n#%%\n\nsales.loc['2015-02-19 10:59:00', 'Company']\n\n\n# #### Selecting whole day\n\n#%%\n\nsales.loc['2015-02-05']\n\n\n# #### Partial datetime string selection\n#\n# * Alternative formats:\n# * ***sales.loc['February 5, 2015']***\n# * ***sales.loc['2015-Feb-5']***\n# * Whole month: ***sales.loc['2015-02']***\n# * Whole year: ***sales.loc['2015']***\n\n# #### Selecting whole month\n\n#%%\n\nsales.loc['2015-02'].head()\n\n\n# #### Slicing using dates/times\n\n#%%\n\nsales.loc['2015-2-16':'2015-2-20']\n\n\n# #### Convert strings to datetime\n\n#%%\n\nevening_2_11 = pd.to_datetime(['2015-2-11 20:03',\n '2015-2-11 21:00',\n '2015-2-11 22:50',\n '2015-2-11 23:00'])\nevening_2_11\n\n\n# #### Reindexing DataFrame\n\n#%%\n\nsales.reindex(evening_2_11)\n\n\n# #### Filling missing values\n\n#%%\n\nsales.reindex(evening_2_11, method='ffill')\n\n#%%\n\nsales.reindex(evening_2_11, method='bfill')\n\n\n# ### Exercises\n\n# #### Reading and slicing times\n#\n# For this exercise, we have read in the same data file using three different approaches:\n#\n# ```python\n# df1 = pd.read_csv(filename)\n# df2 = pd.read_csv(filename, parse_dates=['Date'])\n# df3 = pd.read_csv(filename, index_col='Date', parse_dates=True)\n# ```\n#\n# Use the ```.head()``` and ```.info()``` methods in the IPython Shell to inspect the DataFrames. Then, try to index each DataFrame with a datetime string. Which of the resulting DataFrames allows you to easily index and slice data by dates using, for example, ```df1.loc['2010-Aug-01']```?\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv'\ndf1 = pd.read_csv(data_file)\ndf2 = pd.read_csv(data_file, parse_dates=['Date'])\ndf3 = pd.read_csv(data_file, index_col='Date', parse_dates=True)\n\n#%%\n\ndf1.head()\n\n#%%\n\ndf2.head()\n\n#%%\n\ndf3.head()\n\n\n# ***datatime slicing allowed when index is datetime***\n#\n# * doesn't work with\n# ```python\n# df1.loc['2010-Aug-01']\n# df2.loc['2010-Aug-01']\n# ```\n\n#%%\n\ndf3.loc['2010-Aug-01'].head()\n\n\n# #### Creating and using a DatetimeIndex\n#\n# The pandas Index is a powerful way to handle time series data, so it is valuable to know how to build one yourself. Pandas provides the ```pd.to_datetime()``` function for just this task. For example, if passed the list of strings ```['2015-01-01 091234','2015-01-01 091234']``` and a ```format``` specification variable, such as ```format='%Y-%m-%d %H%M%S```, pandas will parse the string into the proper datetime elements and build the datetime objects.\n#\n# In this exercise, a list of temperature data and a list of date strings has been pre-loaded for you as ```temperature_list``` and ```date_list``` respectively. 
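\n#\n# As a minimal sketch of that pattern (using made-up date strings and temperatures rather than the pre-loaded lists): parse the strings with an explicit ***format***, then use the result as a Series index.\n#\n# ```python\n# import pandas as pd\n#\n# # Hypothetical strings laid out as '%Y-%m-%d %H:%M'\n# dates = ['2010-01-01 00:00', '2010-01-01 01:00', '2010-01-01 02:00']\n# temps = [46.2, 44.6, 44.1]\n#\n# idx = pd.to_datetime(dates, format='%Y-%m-%d %H:%M')\n# ts = pd.Series(temps, index=idx)\n# print(ts)\n# ```\n#\n# 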
Your job is to use the ```.to_datetime()``` method to build a DatetimeIndex out of the list of date strings, and to then use it along with the list of temperature data to build a pandas Series.\n#\n# ***Instructions***\n#\n# * Prepare a format string, ***time_format***, using ***'%Y-%m-%d %H:%M'*** as the desired format.\n# * Convert ***date_list*** into a ***datetime*** object by using the ***pd.to_datetime()*** function. Specify the format string you defined above and assign the result to ***my_datetimes***.\n# * Construct a pandas Series called ***time_series*** using ***pd.Series()*** with ***temperature_list*** and ***my_datetimes***. Set the ***index*** of the Series to be ***my_datetimes***.\n\n#%%\n\ndate_file = 'data/date_list.csv'\ndate_df = pd.read_csv(date_file, header=None)\n\ndate_df[0] = date_df[0].map(lambda x: x.lstrip(\" '\").rstrip(\"',\"))\n\ndate_df.head()\n\n#%%\n\ndate_list = list(date_df[0])\ndate_list[:10]\n\n#%%\n\ntemp_list = np.random.uniform(low=41.8, high=95.3, size=8759)\ntemp_list\n\n#%%\n\n# Prepare a format string: time_format\ntime_format = '%Y-%m-%d %H:%M'\n\n#%%\n\n# Convert date_list into a datetime object: my_datetimes\nmy_datetimes = pd.to_datetime(date_list, format=time_format)\nmy_datetimes\n\n#%%\n\n# Construct a pandas Series using temperature_list and my_datetimes: time_series\ntime_series = pd.Series(temp_list, index=my_datetimes)\n\n#%%\n\ntime_series.head()\n\n\n# #### Partial string indexing and slicing\n#\n# Pandas time series support \"partial string\" indexing. What this means is that even when passed only a portion of the datetime, such as the date but not the time, pandas is remarkably good at doing what one would expect. Pandas datetime indexing also supports a wide variety of commonly used datetime string formats, even when mixed.\n#\n# In this exercise, a time series that contains hourly weather data has been pre-loaded for you. This data was read using the ```parse_dates=True``` option in ```read_csv()``` with ```index_col=\"Dates\"``` so that the Index is indeed a ```DatetimeIndex```.\n#\n# All data from the ```'Temperature'``` column has been extracted into the variable ```ts0```. Your job is to use a variety of natural date strings to extract one or more values from ```ts0```.\n#\n# After you are done, you will have three new variables - ```ts1```, ```ts2```, and ```ts3```. You can slice these further to extract only the first and last entries of each. Try doing this after your submission for more practice.\n#\n# ***Instructions***\n#\n# * Extract data from ***ts0*** for a single hour - the hour from 9pm to 10pm on ***2010-10-11***. Assign it to ***ts1***.\n# * Extract data from ***ts0*** for a single day - ***July 4th, 2010*** - and assign it to ***ts2***.\n# * Extract data from ***ts0*** for the second half of December 2010 - ***12/15/2010*** to ***12/31/2010***. Assign it to ***ts3***.\n\n#%%\n\n# Extract the hour from 9pm to 10pm on '2010-10-11': ts1\nts1 = time_series.loc['2010-10-11 21:00:00':'2010-10-11 22:00:00']\nts1.head()\n\n#%%\n\n# Extract '2010-07-04' from ts0: ts2\nts2 = time_series.loc['2010-07-04']\nts2.head()\n\n#%%\n\n# Extract data from '2010-12-15' to '2010-12-31': ts3\nts3 = time_series.loc['2010-12-15':'2010-12-31']\nts3.head()\n\n\n# #### Reindexing the Index\n#\n# Reindexing is useful in preparation for adding or otherwise combining two time series data sets. To reindex the data, we provide a new index and ask pandas to try and match the old data to the new index. 
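\n#\n# A minimal sketch with a toy series (made-up values, not the exercise data) shows what reindexing does when some of the requested dates have no data, with and without a fill method:\n#\n# ```python\n# import pandas as pd\n#\n# # Toy series indexed by three weekdays only\n# weekdays = pd.Series([1.0, 2.0, 3.0],\n#                      index=pd.to_datetime(['2016-07-01', '2016-07-04', '2016-07-05']))\n# all_days = pd.date_range('2016-07-01', '2016-07-05')\n#\n# print(weekdays.reindex(all_days))                  # missing days become NaN\n# print(weekdays.reindex(all_days, method='ffill'))  # missing days forward-filled\n# ```\n#\n# 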
If data is unavailable for one of the new index dates or times, you must tell pandas how to fill it in. Otherwise, pandas will fill with ```NaN``` by default.\n#\n# In this exercise, two time series data sets containing daily data have been pre-loaded for you, each indexed by dates. The first, ```ts1```, includes weekends, but the second, ```ts2```, does not. The goal is to combine the two data sets in a sensible way. Your job is to reindex the second data set so that it has weekends as well, and then add it to the first. When you are done, it would be informative to inspect your results.\n#\n# ***Instructions***\n#\n# * Create a new time series ***ts3*** by reindexing ***ts2*** with the index of ***ts1***. To do this, call ***.reindex()*** on ***ts2*** and pass in the index of ***ts1*** (***ts1.index***).\n# * Create another new time series, ***ts4***, by calling the same ***.reindex()*** as above, but also specifiying a fill method, using the keyword argument ***method=\"ffill\"*** to forward-fill values.\n# * Add ***ts1 + ts2***. Assign the result to ***sum12***.\n# * Add ***ts1 + ts3***. Assign the result to ***sum13***.\n# * Add ***ts1 + ts4***. Assign the result to ***sum14***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reindex without fill method: ts3\nts3 = ts2.reindex(ts1.index)\nts3\n\n#%%\n\n# Reindex with fill method, using forward fill: ts4\nts4 = ts2.reindex(ts1.index, method='ffill')\nts4\n\n#%%\n\n# Combine ts1 + ts2: sum12\nsum12 = ts1 + ts2\nsum12\n\n#%%\n\n# Combine ts1 + ts3: sum13\nsum13 = ts1 + ts3\nsum13\n\n#%%\n\n# Combine ts1 + ts4: sum14\nsum14 = ts1 + ts4\nsum14\n\n\n# ### Resampling pandas time series\n\n# #### Sales Data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=True,\n index_col='Date')\nsales.head()\n\n\n# #### Resampling\n#\n# * Statistical methods over different time intervals\n# ```python\n# mean()\n# sum()\n# count()\n# # etc.\n# ```\n# * Down-sampling\n# * reduce datetime rows to slower frequency\n# * Up-sampling\n# * increase datetime rows to faster frequency\n\n# #### Aggregating means\n\n#%%\n\ndaily_mean = sales.resample('D').mean()\ndaily_mean.head()\n\n\n# #### Verifying\n\n#%%\n\ndaily_mean.loc['2015-2-2']\n\n#%%\n\nsales.loc['2015-2-2', 'Units']\n\n#%%\n\nsales.loc['2015-2-2', 'Units'].mean()\n\n\n# #### Method chaining\n\n#%%\n\nsales.resample('D').sum().head()\n\n#%%\n\nsales.resample('D').sum().max()\n\n\n# #### Resampling strings\n\n#%%\n\nsales.resample('W').count()\n\n\n# #### Resampling frequencies\n\n#%%\n\nget_ipython().run_cell_magic(\n 'html', '', '')\n\n\n# | Input | Description |\n# |------------|--------------|\n# | 'min', 'T' | minute |\n# | 'H' | hour |\n# | 'D' | day |\n# | 'B' | business day |\n# | 'W' | week |\n# | 'M' | month |\n# | 'Q' | quarter 
|\n# | 'A' | year |\n\n# #### Multiplying frequencies\n\n#%%\n\nsales.loc[:, 'Units'].resample('2W').sum()\n\n\n# #### Upsampling\n\n#%%\n\ntwo_days = sales.loc['2015-2-4':'2015-2-5', 'Units']\ntwo_days\n\n\n# #### Upsampling and filling\n\n#%%\n\ntwo_days.resample('4H').ffill()\n\n\n# ### Exercises\n\n# #### Resampling and frequency\n#\n# Pandas provides methods for resampling time series data. When downsampling or upsampling, the syntax is similar, but the methods called are different. Both use the concept of 'method chaining' - ```df.method1().method2().method3()``` - to direct the output from one method call to the input of the next, and so on, as a sequence of operations, one feeding into the next.\n#\n# For example, if you have hourly data, and just need daily data, pandas will not guess how to throw out the 23 of 24 points. You must specify this in the method. One approach, for instance, could be to take the mean, as in ```df.resample('D').mean()```.\n#\n# In this exercise, a data set containing hourly temperature data has been pre-loaded for you. Your job is to resample the data using a variety of aggregation methods to answer a few questions.\n#\n# ***Instructions***\n#\n# * Downsample the ***'Temperature'*** column of ***df*** to 6 hour data using ***.resample('6h')*** and ***.mean()***. Assign the result to ***df1***.\n# * Downsample the ***'Temperature'*** column of ***df*** to daily data using ***.resample('D')*** and then count the number of data points in each day with ***.count()***. Assign the result ***df2***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Downsample to 6 hour data and aggregate by mean: df1\ndf1 = df.Temperature.resample('6H').mean()\ndf1.head()\n\n#%%\n\n# Downsample to daily data and count the number of data points: df2\ndf2 = df.Temperature.resample('D').count()\ndf2.head()\n\n\n# #### Separating and resampling\n#\n# With pandas, you can resample in different ways on different subsets of your data. For example, resampling different months of data with different aggregations. In this exercise, the data set containing hourly temperature data from the last exercise has been pre-loaded.\n#\n# Your job is to resample the data using a variety of aggregation methods. The DataFrame is available in the workspace as ```df```. You will be working with the ```'Temperature'``` column.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data for August 2010 into ***august***.\n# * Use the temperature data for August and downsample to find the daily maximum temperatures. Store the result in ***august_highs***.\n# * Use partial string indexing to extract temperature data for February 2010 into ***february***.\n# * Use the temperature data for February and downsample to find the daily minimum temperatures. 
Store the result in ***february_lows***.\n\n#%%\n\n# Extract temperature data for August: august\naugust = df.loc['2010-08', 'Temperature']\naugust.head()\n\n#%%\n\n# Downsample to obtain only the daily highest temperatures in August: august_highs\naugust_highs = august.resample('D').max()\naugust_highs.head()\n\n#%%\n\n# Extract temperature data for February: february\nfebruary = df.loc['2010-02', 'Temperature']\nfebruary.head()\n\n#%%\n\n# Downsample to obtain the daily lowest temperatures in February: february_lows\nfebruary_lows = february.resample('D').min()\nfebruary_lows.head()\n\n\n# #### Rolling mean and frequency\n#\n# In this exercise, some hourly weather data is pre-loaded for you. You will continue to practice resampling, this time using rolling means.\n#\n# Rolling means (or moving averages) are generally used to smooth out short-term fluctuations in time series data and highlight long-term trends.\n#\n# To use the ```.rolling()``` method, you must always use method chaining, first calling ```.rolling()``` and then chaining an aggregation method after it. For example, with a Series ```hourly_data```, ```hourly_data.rolling(window=24).mean()``` would compute new values for each hourly point, based on a 24-hour window stretching out behind each point. The frequency of the output data is the same: it is still hourly. Such an operation is useful for smoothing time series data.\n#\n# Your job is to resample the data using the combination of ```.rolling()``` and ```.mean()```. You will work with the same DataFrame ```df``` from the previous exercise.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract temperature data from August 1 2010 to August 15 2010. Assign to ***unsmoothed***.\n# * Use ***.rolling()*** with a 24 hour window to smooth the mean temperature data. Assign the result to ***smoothed***.\n# * Use a dictionary to create a new DataFrame ***august*** with the time series ***smoothed*** and ***unsmoothed*** as columns.\n# * Plot both the columns of ***august*** as line plots using the ***.plot()*** method.\n\n#%%\n\n# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed\nunsmoothed = df['Temperature']['2010-Aug-01':'2010-Aug-15']\nunsmoothed.head()\n\n#%%\n\n# Apply a rolling mean with a 24 hour window: smoothed\nsmoothed = df['Temperature']['2010-Aug-01':\n                             '2010-Aug-15'].rolling(window=24).mean()\nsmoothed.iloc[20:30]\n\n#%%\n\n# Create a new DataFrame with columns smoothed and unsmoothed: august\naugust = pd.DataFrame({'smoothed': smoothed, 'unsmoothed': unsmoothed})\naugust.head()\n\n#%%\n\n# Plot both smoothed and unsmoothed data using august.plot().\naugust.plot()\n\n\n# #### Resample and roll with it\n#\n# As of pandas version 0.18.0, the interface for applying rolling transformations to time series has become more consistent and flexible, and feels somewhat like a ```groupby``` (If you do not know what a ```groupby``` is, don't worry, you will learn about it in the next course!).\n#\n# You can now flexibly chain together resampling and rolling operations. In this exercise, the same weather data from the previous exercises has been pre-loaded for you. 
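\n#\n# A minimal illustration of such a chain, using a made-up hourly series rather than the pre-loaded weather data:\n#\n# ```python\n# import numpy as np\n# import pandas as pd\n#\n# # Two weeks of hypothetical hourly readings\n# rng = pd.date_range('2016-08-01', periods=14 * 24, freq='H')\n# hourly = pd.Series(np.random.default_rng(0).uniform(60, 100, len(rng)), index=rng)\n#\n# # Daily maxima, then a 7-day rolling mean of those maxima, in one chain\n# print(hourly.resample('D').max().rolling(window=7).mean())\n# ```\n#\n# 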
Your job is to extract one month of data, resample to find the daily high temperatures, and then use a rolling and aggregation operation to smooth the data.\n#\n# ***Instructions***\n#\n# * Use partial string indexing to extract August 2010 temperature data, and assign to ***august***.\n# * Resample to daily frequency, saving the maximum daily temperatures, and assign the result to ***daily_highs***.\n# * As part of one long method chain, repeat the above resampling (or you can re-use ***daily_highs***) and then combine it with ***.rolling()*** to apply a 7 day ***.mean()*** (with ***window=7*** inside ***.rolling()***) so as to smooth the daily highs. Assign the result to ***daily_highs_smoothed*** and print the result.\n\n#%%\n\n# Extract the August 2010 data: august\naugust = df['Temperature']['2010-08']\naugust.head()\n\n#%%\n\n# Resample to daily data, aggregating by max: daily_highs\ndaily_highs = august.resample('D').max()\ndaily_highs.head()\n\n#%%\n\n# Use a rolling 7-day window with method chaining to smooth the daily high temperatures in August\ndaily_highs_smoothed = daily_highs.rolling(window=7).mean()\ndaily_highs_smoothed.head(10)\n\n\n# ### Manipulating pandas time series\n\n# #### Sales data\n\n#%%\n\nsales = pd.read_csv('data/sales_data/sales-feb-2015.csv',\n parse_dates=['Date'])\nsales.head()\n\n\n# #### String methods\n\n#%%\n\nsales['Company'].str.upper().head()\n\n\n# #### Substring matching\n\n#%%\n\nsales['Product'].str.contains('ware').head()\n\n\n# #### Boolean arithmetic\n\n#%%\n\nprint(True + False)\nprint(True + True)\nprint(False + False)\n\n\n# #### Boolean reductions\n\n#%%\n\nsales['Product'].str.contains('ware').sum()\n\n\n# #### Datetime methods\n\n#%%\n\nsales['Date'].dt.hour.head()\n\n\n# #### Set timezone\n\n#%%\n\ncentral = sales['Date'].dt.tz_localize('US/Central')\ncentral.head()\n\n\n# #### Convert timezone\n\n#%%\n\ncentral.dt.tz_convert('US/Eastern').head()\n\n\n# #### Method chaining\n\n#%%\n\nsales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern').head()\n\n\n# #### World Population\n\n#%%\n\npopulation = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/world_population.csv',\n parse_dates=True,\n index_col='Date')\npopulation\n\n\n# #### Upsample population\n\n#%%\n\npopulation.resample('A').first().head(11)\n\n\n# #### Interpolate missing data\n\n#%%\n\npopulation.resample('A').first().interpolate('linear').head(11)\n\n\n# ### Exercises\n\n# #### Method chaining and filtering\n#\n# We've seen that pandas supports method chaining. This technique can be very powerful when cleaning and filtering data.\n#\n# In this exercise, a DataFrame containing flight departure data for a single airline and a single airport for the month of July 2015 has been pre-loaded. Your job is to use ```.str()``` filtering and method chaining to generate summary statistics on flight delays each day to Dallas.\n#\n# ***Instructions***\n#\n# * Use ***.str.strip()*** to strip extra whitespace from ***df.columns***. Assign the result back to ***df.columns***.\n# * In the ***'Destination Airport'*** column, extract all entries where Dallas (***'DAL'***) is the destination airport. Use ***.str.contains('DAL')*** for this and store the result in ***dallas***.\n# * Resample ***dallas*** such that you get the total number of departures each day. Store the result in ***daily_departures***.\n# * Generate summary statistics for daily Dallas departures using ***.describe()***. 
Store the result in ***stats***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True,\n index_col='Date (MM/DD/YYYY)')\ndf.head()\n\n#%%\n\n# Strip extra whitespace from the column names: df.columns\nprint(f'Before: \\n {df.columns}')\ndf.columns = df.columns.str.strip()\nprint(f'After: \\n {df.columns}')\n\n#%%\n\n# Extract data for which the destination airport is Dallas: dallas\ndallas = df['Destination Airport'].str.contains('DAL')\ndallas.head()\n\n#%%\n\n# Compute the total number of Dallas departures each day: daily_departures\ndaily_departures = dallas.resample('D').sum()\ndaily_departures.head()\n\n#%%\n\n# Generate the summary statistics for daily Dallas departures: stats\nstats = daily_departures.describe()\nstats\n\n\n# #### Missing values and interpolation\n#\n# One common application of interpolation in data analysis is to fill in missing data.\n#\n# In this exercise, noisy measured data that has some dropped or otherwise missing values has been loaded. The goal is to compare two time series, and then look at summary statistics of the differences. The problem is that one of the data sets is missing data at some of the times. The pre-loaded data ```ts1``` has value for all times, yet the data set ```ts2``` does not: it is missing data for the weekends.\n#\n# Your job is to first interpolate to fill in the data for all days. Then, compute the differences between the two data sets, now that they both have full support for all times. Finally, generate the summary statistics that describe the distribution of differences.\n#\n# ***Instructions***\n#\n# * Replace the index of ***ts2*** with that of ***ts1***, and then fill in the missing values of ***ts2*** by using ***.interpolate(how='linear')***. Save the result as ***ts2_interp***.\n# * Compute the difference between ***ts1*** and ***ts2_interp***. Take the absolute value of the difference with ***np.abs()***, and assign the result to ***differences***.\n# * Generate and print summary statistics of the ***differences*** with ***.describe()*** and ***print()***.\n\n#%%\n\nts1_index = pd.DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',\n '2016-07-05', '2016-07-06', '2016-07-07', '2016-07-08',\n '2016-07-09', '2016-07-10', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15', '2016-07-16',\n '2016-07-17'])\nts1_index\n\n#%%\n\nts1_values = np.array([0, 1, 2, 3, 4, 5, 6, 7,\n 8, 9, 10, 11, 12, 13, 14, 15, 16])\nts1_values\n\n#%%\n\nts1 = pd.Series(ts1_values, index=ts1_index)\nts1.head()\n\n#%%\n\nts2_index = pd.DatetimeIndex(['2016-07-01', '2016-07-04', '2016-07-05', '2016-07-06',\n '2016-07-07', '2016-07-08', '2016-07-11', '2016-07-12',\n '2016-07-13', '2016-07-14', '2016-07-15'])\nts2_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])\nts2 = pd.Series(ts2_values, index=ts2_index)\nts2.head()\n\n#%%\n\n# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp\nts2_interp = ts2.reindex(ts1.index).interpolate(how='linear')\nts2_interp\n\n#%%\n\n# Compute the absolute difference of ts1 and ts2_interp: differences\ndifferences = np.abs(ts1 - ts2_interp)\ndifferences\n\n#%%\n\n# Generate and print summary statistics of the differences\ndifferences.describe()\n\n\n# #### Time zones and conversion\n#\n# Time zone handling with pandas typically assumes that you are handling the Index of the Series. 
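\n#\n# A short sketch (with made-up timestamps) of the distinction drawn here: a ```DatetimeIndex``` exposes ```.tz_localize()``` and ```.tz_convert()``` directly, while datetimes stored in a column are reached through the ```.dt``` accessor.\n#\n# ```python\n# import pandas as pd\n#\n# stamps = pd.to_datetime(['2015-07-01 08:00', '2015-07-01 09:30'])  # naive timestamps\n#\n# # On an Index\n# print(stamps.tz_localize('US/Central').tz_convert('US/Pacific'))\n#\n# # On column data, via .dt\n# col = pd.Series(stamps)\n# print(col.dt.tz_localize('US/Central').dt.tz_convert('US/Pacific'))\n# ```\n#\n# 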
In this exercise, you will learn how to handle timezones that are associated with datetimes in the column data, and not just the Index.\n#\n# You will work with the flight departure dataset again, and this time you will select Los Angeles (```'LAX'```) as the destination airport.\n#\n# Here we will use a mask to ensure that we only compute on data we actually want. To learn more about Boolean masks, click [here](#https://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html)!\n#\n# ***Instructions***\n#\n# * Create a Boolean mask, ***mask***, such that if the ***'Destination Airport'*** column of df equals ***'LAX'***, the result is ***True***, and otherwise, it is ***False***.\n# * Use the mask to extract only the ***LAX*** rows. Assign the result to ***la***.\n# * Concatenate the two columns ***la['Date (MM/DD/YYYY)']*** and ***la['Wheels-off Time']*** with a ***' '*** space in between. Pass this ***to pd.to_datetime()*** to create a datetime array of all the times the LAX-bound flights left the ground.\n# * Use ***Series.dt.tz_localize()*** to localize the time to ***'US/Central'***.\n# * Use the ***.dt.tz_convert()*** method to convert datetimes from ***'US/Central'*** to ***'US/Pacific'***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/austin_airport_departure_data_2015_july.csv',\n skiprows=15,\n parse_dates=True)\ndf.columns = df.columns.str.strip()\ndf.head()\n\n#%%\n\n# Build a Boolean mask to filter out all the 'LAX' departure flights: mask\nmask = df['Destination Airport'] == 'LAX'\n\n#%%\n\n# Use the mask to subset the data: la\nla = df[mask]\nla.head()\n\n#%%\n\n# Combine two columns of data to create a datetime series: times_tz_none\ntimes_tz_none = pd.to_datetime(\n la['Date (MM/DD/YYYY)'] + ' ' + la['Wheels-off Time'])\ntimes_tz_none.head()\n\n#%%\n\n# Localize the time to US/Central: times_tz_central\ntimes_tz_central = times_tz_none.dt.tz_localize('US/Central')\ntimes_tz_central.head()\n\n#%%\n\n# Convert the datetimes from US/Central to US/Pacific\ntimes_tz_pacific = times_tz_central.dt.tz_convert('US/Pacific')\ntimes_tz_pacific.head()\n\n\n# ### Visualizing pandas time series\n\n# ***Topics***\n# * Line types\n# * Plot types\n# * Subplots\n\n#%%\n\nsp500 = pd.read_csv('data/sp500_2010-01-01_-_2015-12-31.csv',\n parse_dates=True,\n index_col='Date')\nsp500.head()\n\n\n# #### Pandas plot\n\n#%%\n\nsp500['Close'].plot()\n\n\n# #### Labels and title\n\n#%%\n\nsp500['Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### One week\n\n#%%\n\nsp500.loc['2012-4-1':'2012-4-7', 'Close'].plot(title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Plot styles\n\n#%%\n\nsp500.loc['2012-4', 'Close'].plot(style='k.-', title='S&P500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### More plot styles\n#\n# * Style format string\n# * color (k: black)\n# * marker (.: dot)\n# * line type (-: solid)\n#\n# | Color | Marker | Line |\n# |:--------:|:---------:|:---------:|\n# | b: blue | o: circle | : dotted |\n# | g: green | *: star | -: dashed |\n# | r: red | s: square | |\n# | c: cyan | +: plus | |\n\n# #### Area plot\n\n#%%\n\nsp500['Close'].plot(kind='area', title='S&P 500')\nplt.ylabel('Closing Price (US Dollars)')\n\n\n# #### Multiple columns\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(title='S&P 500')\n\n\n# #### Subplots\n\n#%%\n\nsp500.loc['2012', ['Close', 'Volume']].plot(subplots=True)\n\n\n# ### Exercises\n\n# #### Plotting time series, datetime indexing\n#\n# Pandas handles datetimes 
not only in your data, but also in your plotting.\n#\n# In this exercise, some time series data has been pre-loaded. However, we have not parsed the date-like columns nor set the index, as we have done for you in the past!\n#\n# The plot displayed is how pandas renders data with the default integer/positional index. Your job is to convert the ```'Date'``` column from a collection of strings into a collection of datetime objects. Then, you will use this converted ```'Date'``` column as your new index, and re-plot the data, noting the improved datetime awareness. After you are done, you can cycle between the two plots you generated by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# Before proceeding, look at the plot shown and observe how pandas handles data with the default integer index. Then, inspect the DataFrame ```df``` using the ```.head()``` method in the IPython Shell to get a feel for its structure.\n#\n# ***Instructions***\n#\n# * Use ***pd.to_datetime()*** to convert the ***'Date'*** column to a collection of datetime objects, and assign back to ***df.Date***.\n# * Set the index to this updated ***'Date'*** column, using ***df.set_index()*** with the optional keyword argument ***inplace=True***, so that you don't have to assign the result back to ***df***.\n# * Re-plot the DataFrame to see that the axis is now datetime aware. This code has been written for you.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n usecols=[0, 3])\ndf.head()\n\n#%%\n\n# Plot the raw data before setting the datetime index\ndf.plot()\n\n#%%\n\n# Convert the 'Date' column into a collection of datetime objects: df.Date\ndf.Date = pd.to_datetime(df.Date)\ndf.Date.head()\n\n#%%\n\n# Set the index to be the converted 'Date' column\ndf.set_index('Date', inplace=True)\ndf.head()\n\n#%%\n\n# Re-plot the DataFrame to see that the axis is now datetime aware!\ndf.plot()\n\n\n# #### Plotting date ranges, partial indexing\n#\n# Now that you have set the DatetimeIndex in your DataFrame, you have a much more powerful and flexible set of tools to use when plotting your time series data. Of these, one of the most convenient is partial string indexing and slicing. In this exercise, we've pre-loaded a full year of Austin 2010 weather data, with the index set to be the datetime parsed ```'Date'``` column as shown in the previous exercise.\n#\n# Your job is to use partial string indexing of the dates, in a variety of datetime string formats, to plot all the summer data and just one week of data together. After you are done, you can cycle between the two plots by clicking on the 'Previous Plot' and 'Next Plot' buttons.\n#\n# First, remind yourself how to extract one month of temperature data using ```'May 2010'``` as a key into ```df.Temperature[]```, and call ```head()``` to inspect the result: ```df.Temperature['May 2010'].head()```.\n#\n# ***Instructions***\n#\n# * Plot the summer temperatures using method chaining. 
The summer ranges from the months ***'2010-Jun'*** to ***'2010-Aug'***.\n# * Plot the temperatures for one week in June using the same method chaining, but this time indexing with ***'2010-06-10':'2010-06-17'*** before you follow up with ***.plot()***.\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf.head()\n\n#%%\n\n# Plot the summer data\ndf.Temperature['2010-Jun':'2010-Aug'].plot()\n\n#%%\n\n# Plot the one week data\ndf.Temperature['2010-06-10':'2010-06-17'].plot()\n\n\n# ## Case Study - Sunlight in Austin\n#\n# Working with real-world weather and climate data, in this chapter you will bring together and apply all of the skills you have acquired in this course. You will use Pandas to manipulate the data into a form usable for analysis, and then systematically explore it using the techniques you learned in the prior chapters. Enjoy!\n\n# ### Reading and Cleaning the Data\n\n# #### Case study\n#\n# * Comparing observed weather data from two sources\n\n# #### Climate normals of Austin, TX\n\n#%%\n\ndf_climate = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/weather_data_austin_2010.csv',\n parse_dates=True,\n index_col='Date')\ndf_climate.head()\n\n\n# #### Weather data of Austin, TX\n\n#%%\n\ndf = pd.read_csv('DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt',\n header=None)\ndf.head()\n\n\n# #### Reminder: read_csv()\n#\n# * Useful keyword options\n# * names: assigning column labels\n# * index_col: assigning index\n# * parse_dates: parsing datetimes\n# * na_values: parsing NaNs\n\n# ### Exercises\n\n# #### Reading in a data file\n#\n# Now that you have identified the method to use to read the data, let's try to read one file. The problem with real data such as this is that the files are almost never formatted in a convenient way. In this exercise, there are several problems to overcome in reading the file. First, there is no header, and thus the columns don't have labels. There is also no obvious index column, since none of the data columns contain a full date or time.\n#\n# Your job is to read the file into a DataFrame using the default arguments. After inspecting it, you will re-read the file specifying that there are no headers supplied.\n#\n# The CSV file has been provided for you as the variable ```data_file```.\n#\n# ***Instructions***\n#\n# * Import ***pandas*** as ***pd***.\n# * Read the file ***data_file*** into a DataFrame called ***df***.\n# * Print the output of ***df.head()***. This has been done for you. Notice the formatting problems in ***df***.\n# * Re-read the data using specifying the keyword argument ***header=None*** and assign it to ***df_headers***.\n# * Print the output of ***df_headers.head()***. This has already been done for you. 
Hit 'Submit Answer' and see how this resolves the formatting issues.\n\n#%%\n\ndata_file = 'DataCamp-master/11-pandas-foundations/_datasets/NOAA_QCLCD_2011_hourly_13904.txt'\n\n#%%\n\n# Read in the data file: df\ndf = pd.read_csv(data_file)\ndf.head()\n\n#%%\n\n# Read in the data file with header=None: df_headers\ndf_headers = pd.read_csv(data_file,\n header=None)\ndf_headers.head()\n\n\n# #### Re-assigning column names\n#\n# After the initial step of reading in the data, the next step is to clean and tidy it so that it is easier to work with.\n#\n# In this exercise, you will begin this cleaning process by re-assigning column names and dropping unnecessary columns.\n#\n# pandas has been imported in the workspace as ```pd```, and the file ```NOAA_QCLCD_2011_hourly_13904.txt``` has been parsed and loaded into a DataFrame ```df```. The comma separated string of column names, ```column_labels```, and list of columns to drop, ```list_to_drop```, have also been loaded for you.\n#\n# ***Instructions***\n#\n# * Convert the comma separated string ***column_labels*** to a list of strings using ***.split(',')***. Assign the result to ***column_labels_list***.\n# * Reassign ***df.columns*** using the list of strings ***column_labels_list***.\n# * Call ***df.drop()*** with ***list_to_drop*** and ***axis='columns'***. Assign the result to ***df_dropped***.\n# * Print ***df_dropped.head()*** to examine the result. This has already been done for you.\n#\n\n#%%\n\ncolumn_labels = 'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'\n\n#%%\n\nlist_to_drop = ['sky_conditionFlag',\n 'visibilityFlag',\n 'wx_and_obst_to_vision',\n 'wx_and_obst_to_visionFlag',\n 'dry_bulb_farenFlag',\n 'dry_bulb_celFlag',\n 'wet_bulb_farenFlag',\n 'wet_bulb_celFlag',\n 'dew_point_farenFlag',\n 'dew_point_celFlag',\n 'relative_humidityFlag',\n 'wind_speedFlag',\n 'wind_directionFlag',\n 'value_for_wind_character',\n 'value_for_wind_characterFlag',\n 'station_pressureFlag',\n 'pressure_tendencyFlag',\n 'pressure_tendency',\n 'presschange',\n 'presschangeFlag',\n 'sea_level_pressureFlag',\n 'hourly_precip',\n 'hourly_precipFlag',\n 'altimeter',\n 'record_type',\n 'altimeterFlag',\n 'junk']\n\n#%%\n\n# Split on the comma to create a list: column_labels_list\ncolumn_labels_list = column_labels.split(',')\ncolumn_labels_list\n\n#%%\n\n# Assign the new column labels to the DataFrame: df.columns\ndf.columns = column_labels_list\n\n#%%\n\n# Remove the appropriate columns: df_dropped\ndf_dropped = df.drop(list_to_drop, axis='columns')\ndf_dropped.head()\n\n\n# #### Cleaning and tidying datetime data\n#\n# In order to use the full power of pandas time series, you must construct a ```DatetimeIndex```. 
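\n#\n# A minimal sketch of that idea on a toy frame (hypothetical values, not the NOAA data): combine a date column and a zero-padded time column into strings, parse them with an explicit format, and use the result as the index. The cells below then carry this out on the real data.\n#\n# ```python\n# import pandas as pd\n#\n# toy = pd.DataFrame({'date': [20110101, 20110101], 'Time': ['53', '1453']})\n#\n# date_string = toy['date'].astype(str) + toy['Time'].apply('{:0>4}'.format)\n# date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\n# print(toy.set_index(date_times))\n# ```\n#\n# 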
To do so, it is necessary to clean and transform the date and time columns.\n#\n# The DataFrame ```df_dropped``` you created in the last exercise is provided for you and pandas has been imported as ```pd```.\n#\n# Your job is to clean up the ```date``` and ```Time``` columns and combine them into a datetime collection to be used as the Index.\n#\n# ***Instructions***\n#\n# * Convert the ***'date'*** column to a string with ***.astype(str)*** and assign to ***df_dropped['date']***.\n# * Add leading zeros to the ***'Time'*** column. This has been done for you.\n# * Concatenate the new ***'date'*** and ***'Time'*** columns together. Assign to ***date_string***.\n# * Convert the ***date_string*** Series to datetime values with ***pd.to_datetime()***. Specify the ***format*** parameter.\n# * Set the index of the ***df_dropped*** DataFrame to be ***date_times***. Assign the result to ***df_clean***.\n\n#%%\n\n# Convert the date column to string: df_dropped['date']\ndf_dropped['date'] = df_dropped.date.astype(str)\n\n#%%\n\n# Pad leading zeros to the Time column: df_dropped['Time']\ndf_dropped['Time'] = df_dropped['Time'].apply(lambda x: '{:0>4}'.format(x))\n\n#%%\n\n# Concatenate the new date and Time columns: date_string\ndate_string = df_dropped['date'] + df_dropped['Time']\ndate_string.head()\n\n#%%\n\n# Convert the date_string Series to datetime: date_times\ndate_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')\ndate_times.head()\n\n#%%\n\n# Set the index to be the new date_times container: df_clean\ndf_clean = df_dropped.set_index(date_times)\ndf_clean.head()\n\n\n# #### Cleaning the numeric columns\n#\n# The numeric columns contain missing values labeled as 'M'. In this exercise, your job is to transform these columns such that they contain only numeric values and interpret missing data as NaN.\n#\n# The pandas function pd.to_numeric() is ideal for this purpose: It converts a Series of values to floating-point values. Furthermore, by specifying the keyword argument errors='coerce', you can force strings like 'M' to be interpreted as NaN.\n#\n# A DataFrame df_clean is provided for you at the start of the exercise, and as usual, pandas has been imported as pd.\n#\n# ***Instructions***\n#\n# * Print the ***'dry_bulb_faren'*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'dry_bulb_faren'*** column to numeric values with ***pd.to_numeric()***. Specify ***errors='coerce'***.\n# * Print the transformed ***dry_bulb_faren*** temperature between 8 AM and 9 AM on June 20, 2011.\n# * Convert the ***'wind_speed***' and ***'dew_point_faren'*** columns to numeric values with ***pd.to_numeric()***. 
Again, specify ***errors='coerce'***.\n#\n\n#%%\n\n# Print the dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']\ndf_clean['dry_bulb_faren'] = pd.to_numeric(\n df_clean['dry_bulb_faren'], errors='coerce')\ndf_clean.dry_bulb_faren.head()\n\n#%%\n\n# Print the transformed dry_bulb_faren temperature between 8 AM and 9 AM on June 20, 2011\ndf_clean.loc['2011-6-20 08:00:00':'2011-6-20 09:00:00', 'dry_bulb_faren']\n\n#%%\n\n# Convert the wind_speed and dew_point_faren columns to numeric values\ndf_clean['wind_speed'] = pd.to_numeric(df_clean['wind_speed'], errors='coerce')\ndf_clean['dew_point_faren'] = pd.to_numeric(\n df_clean['dew_point_faren'], errors='coerce')\n\ndf_clean[['wind_speed', 'dew_point_faren']].head()\n\n\n# ### Statistical exploratory data analysis\n\n# #### Reminder: time series\n#\n# * Index selection by date time\n# * Partial datetime selection\n# * Slicing ranges of datetimes\n#\n# ```python\n# climate2010['2010-05-31 22:00:00'] # datetime\n# climate2010['2010-06-01'] # Entire day\n# climate2010['2010-04'] # Entire month\n# climate2010['2010-09':'2010-10'] # 2 months\n# ```\n\n# #### Reminder: statistics methods\n#\n# * Methods for computing statistics:\n# * describe(): summary\n# * mean(): average\n# * count(): counting entries\n# * median(): median\n# * std(): standard deviation\n\n# ### Exercises\n\n# #### Signal min, max, median\n#\n# Now that you have the data read and cleaned, you can begin with statistical EDA. First, you will analyze the 2011 Austin weather data.\n#\n# Your job in this exercise is to analyze the 'dry_bulb_faren' column and print the median temperatures for specific time ranges. You can do this using partial datetime string selection.\n#\n# The cleaned dataframe is provided in the workspace as df_clean.\n#\n# ***Instructions***\n#\n# * Select the ***'dry_bulb_faren'*** column and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the range ***'2011-Apr':'2011-Jun'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n# * Use ***.loc[]*** to select the month ***'2011-Jan'*** from ***'dry_bulb_faren'*** and print the output of ***.median()***.\n\n#%%\n\n# Print the median of the dry_bulb_faren column\ndf_clean.dry_bulb_faren.median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the time range '2011-Apr':'2011-Jun'\ndf_clean.loc['2011-Apr':'2011-Jun', 'dry_bulb_faren'].median()\n\n#%%\n\n# Print the median of the dry_bulb_faren column for the month of January\ndf_clean.loc['2011-Jan', 'dry_bulb_faren'].median()\n\n\n# #### Signal variance\n#\n# You're now ready to compare the 2011 weather data with the 30-year normals reported in 2010. You can ask questions such as, on average, how much hotter was every day in 2011 than expected from the 30-year average?\n#\n# The DataFrames ```df_clean``` and ```df_climate``` from previous exercises are available in the workspace.\n#\n# Your job is to first resample ```df_clean``` and ```df_climate``` by day and aggregate the mean temperatures. 
You will then extract the temperature related columns from each - ```'dry_bulb_faren'``` in ```df_clean```, and ```'Temperature'``` in ```df_climate``` - as NumPy arrays and compute the difference.\n#\n# Notice that the indexes of ```df_clean``` and ```df_climate``` are not aligned - ```df_clean``` has dates in 2011, while ```df_climate``` has dates in 2010. This is why you extract the temperature columns as NumPy arrays. An alternative approach is to use the pandas ```.reset_index()``` method to make sure the Series align properly. You will practice this approach as well.\n#\n# ***Instructions***\n#\n# * Downsample ***df_clean*** with daily frequency and aggregate by the mean. Store the result as ***daily_mean_2011***.\n# * Extract the ***'dry_bulb_faren'*** column from ***daily_mean_2011*** as a NumPy array using ***.values***. Store the result as ***daily_temp_2011***. Note: ***.values*** is an attribute, not a method, so you don't have to use ***()***.\n# * Downsample ***df_climate*** with daily frequency and aggregate by the mean. Store the result as ***daily_climate***.\n# * Extract the ***'Temperature'*** column from ***daily_climate*** using the ***.reset_index()*** method. To do this, first reset the index of ***daily_climate***, and then use bracket slicing to access ***'Temperature'***. Store the result as ***daily_temp_climate***.\n\n#%%\n\n# Downsample df_clean by day and aggregate by mean: daily_mean_2011\ndaily_mean_2011 = df_clean.resample('D').mean()\ndaily_mean_2011.head()\n\n#%%\n\n# Extract the dry_bulb_faren column from daily_mean_2011 using .values: daily_temp_2011\ndaily_temp_2011 = daily_mean_2011.dry_bulb_faren.values\ndaily_temp_2011[0:10]\n\n#%%\n\n# Downsample df_climate by day and aggregate by mean: daily_climate\ndaily_climate = df_climate.resample('D').mean()\ndaily_climate.head()\n\n#%%\n\n# Extract the Temperature column from daily_climate using .reset_index(): daily_temp_climate\ndaily_temp_climate = daily_climate.reset_index()['Temperature']\ndaily_temp_climate.head()\n\n#%%\n\n# Compute the difference between the two arrays and print the mean difference\ndifference = daily_temp_2011 - daily_temp_climate\ndifference.mean()\n\n\n# #### Sunny or cloudy\n#\n# On average, how much hotter is it when the sun is shining? In this exercise, you will compare temperatures on sunny days against temperatures on overcast days.\n#\n# Your job is to use Boolean selection to filter out sunny and overcast days, and then compute the difference of the mean daily maximum temperatures between each type of day.\n#\n# The DataFrame ```df_clean``` from previous exercises has been provided for you. The column ```'sky_condition'``` provides information about whether the day was sunny (```'CLR'```) or overcast (```'OVC'```).\n#\n# ***Instructions 1/3***\n#\n# * Get the cases in ***df_clean*** where the sky is clear. 
That is, when ***'sky_condition'*** equals ***'CLR'***, assigning to ***is_sky_clear***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_clear***, assigning to ***sunny***.\n# * Resample ***sunny*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\ndf_clean.head(3)\n\n#%%\n\n# Using df_clean, when is sky_condition 'CLR'?\nis_sky_clear = df_clean['sky_condition'] == 'CLR'\nis_sky_clear.head()\n\n#%%\n\n# Filter df_clean using is_sky_clear\nsunny = df_clean[is_sky_clear]\nsunny.head(3)\n\n#%%\n\n# Resample sunny by day then calculate the max\nsunny_daily_max = sunny.resample('D').max()\nsunny_daily_max.head()\n\n\n# ***Instructions 2/3***\n#\n# * Get the cases in ***df_clean*** where the sky is overcast. Using ***.str.contains()***, find when ***'sky_condition'*** contains ***'OVC'***, assigning to ***is_sky_overcast***.\n# * Use ***.loc[]*** to filter ***df_clean*** by ***is_sky_overcast***, assigning to ***overcast***.\n# * Resample ***overcast*** by day (***'D'***), and take the max to find the maximum daily temperature.\n\n#%%\n\n# Using df_clean, when does sky_condition contain 'OVC'?\nis_sky_overcast = df_clean['sky_condition'].str.contains('OVC')\n\n#%%\n\n# Filter df_clean using is_sky_overcast\novercast = df_clean[is_sky_overcast]\n\n#%%\n\n# Resample overcast by day then calculate the max\novercast_daily_max = overcast.resample('D').max()\novercast_daily_max.head()\n\n\n# ***Instructions 3/3***\n#\n# * Calculate the mean of ***sunny_daily_max***, assigning to ***sunny_daily_max_mean***.\n# * Calculate the mean of ***overcast_daily_max***, assigning to ***overcast_daily_max_mean***.\n# * Print ***sunny_daily_max_mean*** minus ***overcast_daily_max_mean***. How much hotter are sunny days?\n\n#%%\n\n# Calculate the mean of sunny_daily_max\nsunny_daily_max_mean = sunny_daily_max.mean()\nsunny_daily_max_mean\n\n#%%", "original_comment": "# Calculate the mean of overcast_daily_max\n", "target_code": "overcast_daily_max_mean = overcast_daily_max.mean()\n", "project_metadata": {"full_name": "trenton3983/DataCamp", "description": "code for DataCamp classes", "topics": [], "git_url": "git://github.com/trenton3983/DataCamp.git", "stars": 7, "watchers": 7, "forks": 15, "created": "2018-06-09T02:19:26Z", "size": 8544, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 2144066, "Python": 101837}, "last_updated": "2020-12-25T15:47:01Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "compatibility": "Strongly agree", "compatibility-score": 3, "precision": "Agree", "precision-score": 2, "coverage": "Disagree", "coverage-score": 1, "usefulness": "Disagree", "usefulness-score": 1}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "sunny_daily_max_mean - overcast_daily_max_mean\n", "model": "docstring", "intent": "# Calculate the mean of overcast_daily_max"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n\n# data definitions\n\npath_data = 
\"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n\nmodel.output_shape\n\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), 
name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n\n15000/2\n\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport sys\nfrom helpers import *\n#!python ../helpers.py\n\n#%%\n\n# data definitions\n\npath_data = \"../ncar-aiml-data-commons/holodec/\"\nnum_particles = 1\noutput_cols_one = [\"x\", \"y\", \"z\", \"d\"]\nscaler_one = MinMaxScaler()\nslice_idx = 15000\nsf = 2\n\n# load and normalize data (this takes approximately 2 minutes)\ntrain_inputs_scaled_one, train_outputs_one, scaler_vals_one = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx, sf=sf)\n\nvalid_inputs_scaled_one, valid_outputs_one, _ = load_scaled_datasets(path_data,\n num_particles,\n output_cols_one,\n slice_idx,\n split='valid',\n scaler_vals=scaler_vals_one, sf=sf)\n\n# extra transform step for output_cols_one in lieu of z mass\n\ntrain_outputs_scaled_one = scaler_one.fit_transform(\n train_outputs_one[output_cols_one])\nvalid_outputs_scaled_one = scaler_one.transform(\n valid_outputs_one[output_cols_one])\n\n#%%\n\ninput_shape = train_inputs_scaled_one[0, :, :].shape\noutput_shape = train_outputs_scaled_one.shape[1]\n\nprint(input_shape)\noutput_shape\n\n\n# # Train a DNN\n\n#%%\n\ninputs = Input(shape=input_shape)\nnn_dense = Flatten()(inputs)\nnn_dense = Dense(1024, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense = Dense(512, activation='relu')(nn_dense)\nnn_dense 
= Dense(output_shape, activation='softmax')(nn_dense)\nmodel = Model(inputs, nn_dense)\n\n#%%\n\nmodel.output_shape\n\n#%%\n\nmodel.compile(optimizer='adam', loss='mse')\nmodel.summary()\n\n#%%\n\nx = train_inputs_scaled_one\nxv = valid_inputs_scaled_one\ny = train_outputs_scaled_one\nyv = valid_outputs_scaled_one\nbatch_size = 256\nepochs = 10\nverbose = 1\n\n\nmodel.fit(x, y, batch_size=batch_size, epochs=epochs,\n verbose=verbose, validation_data=(xv, yv))\n\n#%%\n\nclass DeepNeuralNetwork(object):\n \"\"\"\n A Conv2D Neural Network Model that can support arbitrary numbers of layers.\n\n Attributes:\n filters: List of number of filters in each Conv2D layer\n kernel_sizes: List of kernel sizes in each Conv2D layer\n conv2d_activation: Type of activation function for conv2d layers\n pool_sizes: List of Max Pool sizes\n dense_sizes: Sizes of dense layers\n dense_activation: Type of activation function for dense layers\n output_activation: Type of activation function for output layer\n lr: Optimizer learning rate\n optimizer: Name of optimizer or optimizer object.\n adam_beta_1: Exponential decay rate for the first moment estimates\n adam_beta_2: Exponential decay rate for the first moment estimates\n sgd_momentum: Stochastic Gradient Descent momentum\n decay: Optimizer decay\n loss: Name of loss function or loss object\n batch_size: Number of examples per batch\n epochs: Number of epochs to train\n verbose: Level of detail to provide during training\n model: Keras Model object\n \"\"\"\n\n def __init__(self, dense_sizes=(64,), dense_activation=\"relu\", output_activation=\"softmax\",\n lr=0.001, optimizer=\"adam\", adam_beta_1=0.9, adam_beta_2=0.999,\n sgd_momentum=0.9, decay=0, loss=\"mse\", batch_size=32, epochs=2, verbose=0):\n self.dense_sizes = dense_sizes\n self.dense_activation = dense_activation\n self.output_activation = output_activation\n self.lr = lr\n self.optimizer = optimizer\n self.optimizer_obj = None\n self.adam_beta_1 = adam_beta_1\n self.adam_beta_2 = adam_beta_2\n self.sgd_momentum = sgd_momentum\n self.decay = decay\n self.loss = loss\n self.batch_size = batch_size\n self.epochs = epochs\n self.verbose = verbose\n self.model = None\n\n def build_neural_network(self, input_shape, output_shape):\n \"\"\"Create Keras neural network model and compile it.\"\"\"\n conv_input = Input(shape=(input_shape), name=\"input\")\n nn_model = conv_input\n nn_model = Flatten()(nn_model)\n for h in range(len(self.dense_sizes)):\n nn_model = Dense(\n self.dense_sizes[h], activation=self.dense_activation, name=f\"dense_{h:02d}\")(nn_model)\n nn_model = Dense(\n output_shape, activation=self.output_activation, name=f\"dense_output\")(nn_model)\n self.model = Model(conv_input, nn_model)\n if self.optimizer == \"adam\":\n self.optimizer_obj = Adam(\n lr=self.lr, beta_1=self.adam_beta_1, beta_2=self.adam_beta_2, decay=self.decay)\n elif self.optimizer == \"sgd\":\n self.optimizer_obj = SGD(\n lr=self.lr, momentum=self.sgd_momentum, decay=self.decay)\n self.model.compile(optimizer=self.optimizer, loss=self.loss)\n self.model.summary()\n\n def fit(self, x, y, xv, yv):\n if len(y.shape) == 1:\n output_shape = 1\n else:\n output_shape = y.shape[1]\n input_shape = x.shape[1:]\n self.build_neural_network(input_shape, output_shape)\n self.model.fit(x, y, batch_size=self.batch_size, epochs=self.epochs,\n verbose=self.verbose, validation_data=(xv, yv))\n return self.model.history.history\n\n def predict(self, x):\n y_out = self.model.predict(x, batch_size=self.batch_size)\n return y_out\n\n def 
predict_proba(self, x):\n y_prob = self.model.predict(x, batch_size=self.batch_size)\n return y_prob\n\n#%%\n\n15000/2\n\n#%%\n\nmodel_name = \"dnn1\"\ndense_sizes = [7500, 2048, 1024, 512, 512, 512, 512, 512, 512]\ndense_activation = \"relu\"\nlr = 0.01\noptimizer = \"sgd\"\nloss = \"mae\"\nbatch_size = 256\nepochs = 50\nverbose = 1\n\n#%%\n\none_start = datetime.now()\n# with tf.device('/device:GPU:0'):\nmod = DeepNeuralNetwork(dense_sizes=dense_sizes, dense_activation=dense_activation,\n lr=lr, optimizer=optimizer, loss=loss, batch_size=batch_size, epochs=epochs, verbose=verbose)\nmod.fit(train_inputs_scaled_one, train_outputs_scaled_one,\n valid_inputs_scaled_one, valid_outputs_scaled_one)\n\ntrain_preds_scaled_one = pd.DataFrame(mod.predict(\n train_inputs_scaled_one), columns=output_cols_one)\nvalid_preds_scaled_one = pd.DataFrame(mod.predict(\n valid_inputs_scaled_one), columns=output_cols_one)\nprint(f\"Running model took {datetime.now() - one_start} time\")\n\n#%%", "original_comment": "# Plot a single hologram with the particles overlaid\n", "target_code": "def plot_hologram(h, img, outputs=\"none\"):\n", "project_metadata": {"full_name": "NCAR/ai4ess-hackathon-2020-notebooks", "description": null, "topics": [], "git_url": "git://github.com/NCAR/ai4ess-hackathon-2020-notebooks.git", "stars": 7, "watchers": 7, "forks": 7, "created": "2020-06-30T21:57:57Z", "size": 18992, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 51510984}, "last_updated": "2020-11-13T12:56:58Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Disagree", "coverage-score": 1, "precision": "Disagree", "precision-score": 1, "compatibility": "Disagree", "compatibility-score": 1}, {"completed_by": {"id": 3}, "compatibility": "Strongly disagree", "compatibility-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "plt.plot(mod.model.history.history['loss'])\nplt.plot(mod.model.history.history['val_loss'])\nplt.title('model loss')\nplt.ylabel('loss')\nplt.xlabel('epoch')\nplt.legend(['train', 'validation'], loc='upper right')\nplt.show()\n", "model": "docstring", "intent": "# Plot a single hologram with the particles overlaid"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import 
make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. 
To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with scikit-learn:\n\n\nX, y = 
load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# *This notebook is part of course materials for CS 345: Machine Learning Foundations and Practice at Colorado State University.\n# Original versions were created by Asa Ben-Hur.\n# The content is availabe [on GitHub](https://github.com/asabenhur/CS345).*\n#\n# *The text is released under the [CC BY-SA license](https://creativecommons.org/licenses/by-sa/4.0/), and code is released under the [MIT license](https://opensource.org/licenses/MIT).*\n#\n# \"CC-BY-SA\n#\n\n# \n# \"Open\n# \n\n#%%\n\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.datasets import load_breast_cancer\nfrom sklearn.model_selection import train_test_split\nfrom matplotlib.ticker import LinearLocator, FormatStrFormatter\nfrom mpl_toolkits.mplot3d import Axes3D\nfrom scipy.stats import norm\nfrom sklearn.datasets import make_classification\nimport numpy as np\nimport matplotlib.pyplot as plt\nget_ipython().run_line_magic('autosave', '0')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# # Logistic Regression\n#\n# Although its name would suggest otherwise, logistic regression is a classification method.\n# As we go along, it may become clear why the word regression is in there.\n\n# ### Recap: linear classifiers\n#\n# In earlier notebooks we discussed the use of a linear function to make predictions using a linear function:\n#\n# $$\n# f(\\mathbf{x}) = \\mathbf{w}^\\top \\mathbf{x} + b.\n# $$\n#\n# The class associated with $\\mathbf{x}$ is decided according to the sign of the discriminant function $f(\\mathbf{x})$.\n#\n# For example:\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0,\n n_repeated=0, n_classes=2, n_clusters_per_class=1, class_sep=0.3, random_state=1)\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = np.array([-0.5, 0.1])\nb = 0\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = w[0] * x_grid + w[1] * y_grid + b\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 5,\n origin='lower',\n linewidths=(1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.1f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n\n# ### Using probabilities to quantify prediction confidence\n#\n# As users of machine learning, we are interested in making *confident* predictions. In the context of the linear discriminant function,\n# the magnitude of $f(\\mathbf{x})$ can tell us something about our confidence in the prediction. However, there is no easy way for us to interpret that value as a measure of confidence. What would help us is a way to estimate $p(y | \\mathbf{x})$.\n#\n# In this notebook we will focus on binary classification problems. 
In this case the only two options are $p(y=1 | \\mathbf{x})$, and $p(y=0 | \\mathbf{x})$, which satisfy:\n#\n# $$\n# p(y=0 | \\mathbf{x}) = 1 - p(y=1 | \\mathbf{x})\n# $$\n#\n# The following figure shows $p(y=1 | \\mathbf{x})$ and $p(y=0 | \\mathbf{x})$ for a hypothetical classification problem.\n\n#%%\n\nplt.style.use('bmh')\nplt.xkcd(scale=0.3)\nplt.figure(figsize=(6, 4))\nm1 = 2.5\nstd1 = 1.0\nm2 = 5.0\nstd2 = 1.0\n\n\ndef solve(m1, m2, std1, std2):\n a = 1/(2*std1**2) - 1/(2*std2**2)\n b = m2/(std2**2) - m1/(std1**2)\n c = m1**2 / (2*std1**2) - m2**2 / (2*std2**2) - np.log(std2/std1)\n return np.roots([a, b, c])\n\n\nresult = solve(m1, m2, std1, std2)\nx = np.linspace(-5, 9, 10000)\nax = plt.axes()\nax.plot(x, norm.pdf(x, m1, std1), label=\"p(y=0|x)\")\nax.plot(x, norm.pdf(x, m2, std2), label=\"p(y=1|x)\")\nax.plot(result, norm.pdf(result, m1, std1), 'o')\nplt.xlabel('x')\nplt.ylabel('p(y|x)', rotation=0, labelpad=30)\nax.arrow(result[0], norm.pdf(result, m1, std1)[0], 0, -2,\n head_width=0, head_length=0, fc='k', ec='k', color=\"yellow\")\nplt.text(result[0], -0.05, \"x*\")\nplt.xticks([])\nplt.yticks([])\nplt.legend()\n\n\n# In the above figure, $\\mathbf{x}^*$ is the point where\n# $p(y=1 | \\mathbf{x}) = p(y=0 | \\mathbf{x})$. To the right of it, we would classify examples as belonging to the positive class, and negative to the left.\n\n# We would like to learn to predict a probability $ p(y | \\mathbf{x})$ for a binary classification problem using dot products.\n# A dot product prduces arbitrary numbers, so cannot serve to produce probabilities because they need to be between 0 and 1.\n# However, we can convert a dot product to a probability by applying a \"squashing function\" to the dot product e.g. using the so-called **logistic function**:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\sigma(\\mathbf{w}^\\top \\mathbf{x})\n# $$\n#\n# where $\\sigma(s)$ is the logistic function which is defined by:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1 + e^{-s}}.\n# $$\n#\n# More explicitly, this can be expressed as:\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# Let's take a look at the graph of the logistic function (aka the sigmoid function), which demonstrates its ability to serve as a \"squashing function\":\n\n#%%\n\ndef sigmoid(s):\n return 1/(1+np.exp(-s))\n\n#%%\n\ns = np.linspace(-10, 10, 100)\nplt.figure(figsize=(5, 3))\nplt.plot(s, sigmoid(s))\nplt.xlabel(\"s\", fontsize=20)\nplt.ylabel(r\"${\\sigma}(s)$\", rotation=0, fontsize=20, labelpad=20)\n\n\n# We can express the probability $p(y|\\mathbf{x})$ as follows:\n#\n# $$\n# p(y|\\mathbf{x})= \\begin{cases} \\sigma(\\mathbf{w}^\\top \\mathbf{x}) & \\textrm{for} \\space {y}=1 \\newline\n# 1- \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\space & \\textrm{for} \\space {y}=0\n# \\end{cases}\n# $$\n#\n# The choice of the logistic function seems arbitrary. The following discussion will provide some motivation.\n#\n# First, let's consider the ratio\n#\n# $$\\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})}.$$\n#\n# This is called the **odds**.\n#\n# What did we gain? Whereas $p(y=1|\\mathbf{x})$ is a number between 0 and 1, the odds is a number between 0 and infinity.\n#\n# That's an improvement, but again there's no natural way to model that using a dot product. 
So instead, we will focus on the log odds:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{p(y=0|\\mathbf{x})} = \\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} .$$\n#\n# The log-odds is between negative infinity, and infinity and can therefore be modeled using a dot product.\n#\n# For your reference, here's a nice [video](https://www.youtube.com/watch?v=ARfXDSkQf1Y) that explains odds and odds ratios.\n\n# Our choice to model the log-odds using a dot product gives the following:\n#\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}.$$\n#\n# Solving for $p(y=1|\\mathbf{x})$ we find that\n#\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# In conclusion, the choice to model the log-odds ratio using a dot product gave us the motivation to use the logistic function as a \"squashing function\" to generate a probability from a dot product.\n\n# ### Exercise\n#\n# Check that solving for $p(y=1|\\mathbf{x})$ in\n# $$\\log \\frac{p(y=1|\\mathbf{x})}{1- p(y=1|\\mathbf{x})} = \\mathbf{w}^\\top \\mathbf{x}$$ indeed gives us\n# $$\n# p(y=1|\\mathbf{x})= \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n\n# ### Exercise: Properties of the logistic function\n#\n# Verify that:\n#\n# $$\n# \\sigma(s) = \\frac{1}{1+e^{-s}} = \\frac{e^s}{1+e^s}\n# $$\n#\n# $$\n# 1- \\sigma(s) = \\sigma(-s) = \\frac{1}{1+e^{s}}\n# $$\n#\n# $$\n# \\sigma'(s) = \\sigma(s)(1-\\sigma(s))\n# $$\n#\n# The following figure plots these functions:\n\n#%%\n\nplt.figure(figsize=(5, 3))\ns = np.linspace(-10, 10, 100)\nplt.plot(s, sigmoid(s), label=r'$\\sigma(s)$')\nplt.plot(s, 1-sigmoid(s), label=r'$1-\\sigma(s)$')\nplt.plot(s, sigmoid(s) * (1-sigmoid(s)), label=r\"$\\sigma'(s)$\")\n\nplt.xlabel(\"s\")\nplt.legend()\n\n\n# ### Is logistic regression really linear?\n#\n# Given that the dot product is squashed using a non-linear function, it's not clear that the resulting classifier is indeed linear. To answer this question, we first observe that\n#\n# $$\n# p(y=1|\\mathbf{x})=\\frac{e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# {e^{\\mathbf{w}^\\top \\mathbf{x}}+1}\n# $$\n#\n# $$\n# {p(y=0|\\mathbf{x})}=1-{p(y=1|\\mathbf{x})}=\\frac{1}{1 + e^{\\mathbf{w}^\\top \\mathbf{x}}}\n# $$\n#\n# To figure out how the decision boundary looks like, we consider the following equation:\n#\n# $$\n# p(y=1|\\mathbf{x})=p(y=0|\\mathbf{x})\n# $$\n#\n#\n# Solving for $\\mathbf{x}$ we get that\n# $\n# e^{\\mathbf{w}^\\top \\mathbf{x}}=1\n# $\n# i.e. 
the points on the decision boundary satisfy\n# $\n# \\mathbf{w}^\\top \\mathbf{x}=0\n# $, which is th equation for a hyperplane.\n#\n# Here's a plot that illustrates that:\n#\n\n#%%\n\nfig = plt.figure()\nax = fig.add_subplot(111, projection='3d')\nX = np.linspace(-10, 10, 1000)\nX1, Y1 = np.meshgrid(X, X)\n\nZ = sigmoid(X1)\nsurf = ax.plot_surface(X1, Y1, Z, linewidth=0,\n cmap=plt.get_cmap('bone'), antialiased=False)\n\nax.set_zlim(0, 1.01)\nax.set_xlim(-10, 10)\nax.set_ylim(-10, 10)\n\nax.set_zlabel('probability', rotation=0)\nax.view_init(azim=180+60, elev=40)\n\n\n# ### Maximum likelihood\n#\n# We have expressed our classification problem in the language of probabilities, and therefore, we will apply the principle of *maximum likelihood* in order to find the optimal value of the weight vector\n# $\\mathbf{w}$.\n#\n# The likelihood function is the probability of the labels $y_1,\\ldots,y_N$ given the corresponding $\\mathbf{x}_1,\\ldots, \\mathbf{x}_N$:\n#\n# \\begin{equation}\n# p(y_1,\\ldots,y_N | \\mathbf{x}_1,\\ldots, \\mathbf{x}_N) = \\prod_{i=1}^{N} p({y_i| \\mathbf{x}_i}).\n# \\end{equation}\n#\n# This is valid because we assume that the data points\n# $(\\mathbf{x}_1,y_1),\\ldots,(\\mathbf{x}_N,y_N)$ are independent, a standard assumption in machine learning.\n#\n# The likelihood function depends on the values of model parameters, and using maximum likelihood we seek to find the parameter values that maximize the likelihood function over all choices of those parameters.\n# Intuitively, this selects the parameter values that make the observed data most probable.\n#\n# Let's define $p_i = p(y_i = 1 | \\mathbf{x}_i)$.\n# Using this notation we can express $p({y_i| \\mathbf{x}_i})$ as follows:\n#\n# $$\n# p({y_i| \\mathbf{x}_i}) = p_i^{y_i} (1- p_i)^{1-y_i}.\n# $$\n#\n# Here we assumed that $y_i$ is 0 or 1 for our binary classification problem.\n# We are going to need the negative log of this probability:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log p_i - (1-y_i) \\log(1- p_i)\n# $$\n#\n# Recall that\n#\n# $$\n# p(y=1|\\mathbf{x}) = \\sigma(\\mathbf{x}^\\top\\mathbf{w}) = \\frac{1}{1 + e^{-\\mathbf{w}^\\top \\mathbf{x}}}.\n# $$\n#\n# And inserting the form of $p({y_i| \\mathbf{x}_i})$:\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n#\n\n# ### Maximizing the likelihood\n#\n# We would like to find the weight vector that maximizes the likelihood of the data, i.e. 
find\n# $$\n# \\max_{\\mathbf{w}} \\prod_{i=1}^{N}p(y_i|\\mathbf{x}_i)\n# $$\n# This is equivalent to maximizing the log-likelihood (since the logarithm is a monotonic function):\n# $$\n# \\max_{\\mathbf{w}} \\log \\prod_{i=1}^{N}p(y_i| \\mathbf{x}_i)\n# $$\n# The logarithm of a product is a sum of the logs of the terms so our objective becomes\n#\n# $$\n# \\max_{\\mathbf{w}} \\sum_{i=1}^{N}\\log p(y_i|\\mathbf{x}_i)\n# $$\n#\n# We'll replace maximization with minimization by considering\n#\n# $$\n# \\min_{\\mathbf{w}} -\\frac{1}{N}\\sum_{i=1}^{N}\\log(p(y_i|\\mathbf{x}_i))\n# $$\n#\n# Using the form we derived above for $p({y_i| \\mathbf{x}_i})$\n#\n# $$\n# - \\log p({y_i| \\mathbf{x}_i}) = - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right)\n# $$\n#\n# our objective now becomes:\n#\n# $$\n# \\min_{\\mathbf{w}} \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Let's take a closer look at the expression we derived for $- \\log p({y_i| \\mathbf{x}_i})$. And let's consider the case $y_i = 1$. In that case, only the first term is nonzero. When the predicted probability is equal to 1, which is what we would like, it is equal to 0, and the further away from the desired value it is, the larger the value. A similar observation occurs for $y_i=0$: In this case only the second term contributes, and this term equals to 0 when the predicted probability is equal to 0, which is the desired value for negative examples. Thus, the function\n#\n# $$\n# \\mathcal{l}^{CE}(y,\\mathbf{x}; \\mathbf{w}) = - y \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}) \\right) - (1-y) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x})\\right)\n# $$\n#\n# which is known as the **cross entropy loss** or **log loss**. It quantifies the discrepancy of the predicted probability from the desired label. To demonstrate that it indeed functions like a loss function,\n# let's plot this function for $y = 1$ and $y=0$:\n\n#%%\n\ndef cross_entropy(p, y):\n return -(y*np.log(p)+(1-y)*np.log(1-p))\n\n\nplt.figure(figsize=(5, 3))\np = np.linspace(0.01, 0.99, 100)\nplt.plot(p, cross_entropy(p, 1), label='cross-entropy for y=1')\nplt.plot(p, cross_entropy(p, 0), label='cross-entropy for y=0')\n\nplt.xlabel('probability')\nplt.ylabel('cross entropy')\nplt.legend()\n\n\n# ### Logistic regression loss\n#\n# We have finally arrived at the final form of the loss function for logistic regression:\n#\n# $$\n# J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left[ - y_i \\log \\left( \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) \\right) - (1-y_i) \\log \\left(1 - \\sigma(\\mathbf{w}^\\top \\mathbf{x}_i)\\right) \\right]\n# $$\n#\n# Unlike the case of linear regression where there is a closed-form solution, there is no such solution for the logistic regression loss function. It turns out that the loss function is convex, and therefore there is a global minimum. Gradient descent is therefore a reasonable approach.\n#\n# The gradient of our loss function is:\n# $$\n# \\nabla J(\\mathbf{w}) = \\frac{1}{N}\\sum_{i=1}^{N} \\left(\\sigma(\\mathbf{w}^\\top \\mathbf{x}_i) - y_i\\right) \\mathbf{x}_i\n# $$\n#\n\n# ### Gradient descent\n#\n# **Input:** A labeled dataset; learning rate $\\eta$\n#\n# 1. initialize $\\mathbf{w}(0)$\n# 2. for t = 0, 1, 2,... do\n# 3. 
$\\;\\;\\;\\;\\;\\;$ compute the gradient: $\\mathbf{g}_t$ = $\\nabla J(\\mathbf{w}(t))$\n# 4. $\\;\\;\\;\\;\\;\\;$ update the weights: $\\mathbf{w}(t + 1) = \\mathbf{w}(t) - \\eta \\mathbf{g}_t$\n# 5. $\\;\\;\\;\\;\\;\\;$ if it is time to stop, break from loop.\n# 6. end for\n# 7. return the final weights.\n#\n# This is called **batch gradient descent**.\n# The halting condition can be a specific number of iterations, or that the loss function appears to have converged, i.e. shows little change across epochs.\n\n# ### Implementation\n#\n# Let's implement our gradient descent logistic regression algorithm as a Python class.\n\n#%%\n\nclass logistic_regression:\n def __init__(self, lr=0.01, epochs=350):\n self.lr = lr\n self.epochs = epochs\n\n def decision_function(self, X):\n return 1/(1+np.exp(-np.dot(X, self.w)))\n\n def gradient(self, X, y):\n return (self.decision_function(X) - y)@X/len(X)\n\n def loss(self, X, y):\n pred = self.decision_function(X)\n loss_values = - y * np.log(pred) - (1 - y) * np.log(1 - pred)\n return np.mean(loss_values)\n\n def fit(self, X, y):\n self.w = np.zeros(X.shape[1])\n # save a history of loss values\n self.loss_history = [self.loss(X, y)]\n for epoch in range(self.epochs):\n self.w = self.w - self.lr * self.gradient(X, y)\n self.loss_history.append(self.loss(X, y))\n\n def predict(self, X):\n pred = np.where(self.decision_function(X) >= .5, 1, 0)\n return np.squeeze(pred)\n\n#%%\n\nN = 20\nX = np.linspace(-10, 10, 20)\ny = np.where(X >= 0, 1, 0)\nX = X.reshape(-1, 1)\nprint(X.shape, y.shape)\n\nplt.style.use('default')\nplt.figure(figsize=(5, 3))\nplt.scatter(X, y, alpha=0.8)\nplt.xlabel(\"input\")\nplt.ylabel(\"class label\")\n\n#%%\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.2, shuffle=True)\n\n#%%\n\nlr = logistic_regression()\nlr.fit(X_train, y_train)\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n#%%\n\ny_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n\n#%%\n\nX, y = make_classification(n_samples=100, n_features=2, n_informative=2,\n n_redundant=0, n_repeated=0, n_classes=2,\n n_clusters_per_class=1, class_sep=0.3,\n random_state=1)\nlr = logistic_regression(0.01, 500)\nlr.fit(X, y)\ny_pred = lr.predict(X)\nprint('accuracy: ', np.sum(y_pred == y)/len(y))\n\n\n# Let's plot the resulting decision boundary and weight vector:\n\n#%%\n\nplt.style.use('default')\nplt.scatter(X[:, 0], X[:, 1], c=1-y, alpha=0.5, s=20, cmap='magma')\n\nw = lr.w/np.linalg.norm(lr.w)\n\ndelta = 0.01\nxmin = -1.25\nxmax = 0.75\nymin = -0.75\nymax = 1.5\nxs = np.arange(xmin, xmax, delta)\nys = np.arange(ymin, ymax, delta)\nx_grid, y_grid = np.meshgrid(xs, ys)\n\nZ = sigmoid(lr.w[0]*x_grid + lr.w[1]*y_grid)\n\nim = plt.imshow(Z, origin='lower',\n cmap=plt.cm.gray, extent=(xmin, xmax, ymin, ymax))\n\nC = plt.contour(x_grid, y_grid, Z, 7,\n origin='lower',\n linewidths=(1, 1, 1, 1, 2, 1, 1),\n colors='black')\n\nplt.clabel(C, inline=1, fmt='%1.2f')\n\nplt.arrow(0, 0, w[0], w[1], width=0.001, head_width=0.05,\n length_includes_head=True, alpha=1, linestyle='-', color='k')\nplt.text(w[0], w[1]+0.05, r\"$\\mathbf{w}$\")\n\n#%%\n\nplt.figure(figsize=(5, 3))\nplt.plot(lr.loss_history)\nplt.ylabel('loss')\nplt.xlabel('epoch')\n\n\n# Make sure to train the algorithm for a sufficiently large number of epochs so that the loss has a chance to converg.\n\n# ### Logistic regression in scikit-learn\n#\n# Let's compare results using our implementation with 
scikit-learn:\n\n#%%\n\nX, y = load_breast_cancer(return_X_y=True)\n\n# standardize\nX = StandardScaler().fit_transform(X)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y,\n test_size=0.3, shuffle=True, random_state=2)\n\n# first, our implementation\nlr = logistic_regression(0.01, 1000)\n\n# train the model using the training sets\nget_ipython().run_line_magic('timeit', 'lr.fit(X_train, y_train)')", "original_comment": "# making predictions on the testing set\n", "target_code": "y_pred = lr.predict(X_test)\n", "project_metadata": {"full_name": "asabenhur/CS345", "description": "Jupyter", "topics": [], "git_url": "git://github.com/asabenhur/CS345.git", "stars": 4, "watchers": 4, "forks": 11, "created": "2020-08-11T19:32:02Z", "size": 6413, "license": "mit", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 4808835}, "last_updated": "2020-12-30T20:50:00Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 3}, "usefulness": "Agree", "usefulness-score": 2, "coverage": "Agree", "coverage-score": 2, "precision": "Agree", "precision-score": 2, "compatibility": "Strongly agree", "compatibility-score": 3}, {"completed_by": {"id": 2}, "usefulness": "Strongly agree", "usefulness-score": 3, "coverage": "Strongly agree", "coverage-score": 3, "precision": "Strongly agree", "precision-score": 3, "compatibility": "Strongly agree", "compatibility-score": 3}], "predicted_code": "y_pred = lr.predict(X_test)\nprint('accuracy: ', np.sum(y_pred == y_test)/len(y_test))\n", "model": "docstring", "intent": "# making predictions on the testing set"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)\n\n\n\nDATE = '04202020'\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n#%%\n\nimport inStrain.SNVprofile\nimport inStrain\nimport os\nimport sys\nimport glob\nimport scipy\nimport sklearn\nimport matplotlib\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nfrom collections import defaultdict\nfrom matplotlib import pyplot as plt\nfrom matplotlib.backends.backend_pdf import PdfPages\n\nget_ipython().run_line_magic('matplotlib', 'inline')\nsns.set_style('whitegrid')\npd.set_option('display.max_rows', 100)\nmatplotlib.rcParams['ps.fonttype'] = 42\nmatplotlib.rcParams['pdf.fonttype'] = 42\npd.set_option('display.max_columns', 100)", "original_comment": "# ## Read in information\n", "target_code": "PLdb = pd.read_csv(\n '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))\n", "project_metadata": {"full_name": "MrOlm/covid19_population_genomics", "description": "Analysis of the population diversity of SARS-CoV-2 within and between individual patients", "topics": [], "git_url": 
"git://github.com/MrOlm/covid19_population_genomics.git", "stars": 9, "watchers": 9, "forks": 1, "created": "2020-03-20T16:01:19Z", "size": 170583, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 40959012, "Python": 1028}, "last_updated": "2020-12-05T12:24:09Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "precision": "Strongly disagree", "precision-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Strongly disagree", "compatibility-score": 0}, {"completed_by": {"id": 2}, "usefulness": "Disagree", "usefulness-score": 1, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Agree", "precision-score": 2, "compatibility": "Disagree", "compatibility-score": 1}], "predicted_code": "inFile = '/home/cmb-panasas2/skchoudh/genomes/hg38/Homo_sapiens.GRCh38.fna.gz'\n", "model": "natural", "intent": "# Read in information"}, {"context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the quality of wine\n\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n\nred.head()\n\n\nwhite.head()\n\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = 
wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\ndef frequencyDists(wine_set):\n print(\"This is the frequency distribution of the wines' quality.\")\n", "original_context": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Wine Quality Dataset\n\n# ## Objective: Predict the 
quality of wine\n\n#%%\n\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import classification_report\nfrom sklearn.naive_bayes import BernoulliNB\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.decomposition import PCA\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import KMeans\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.linear_model import LassoLarsCV\nfrom sklearn import preprocessing\nimport operator\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nimport sklearn\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\nimport time\nfrom sklearn import svm\nfrom sklearn.grid_search import GridSearchCV\nfrom sklearn.neighbors import KNeighborsClassifier\nimport pandas as pd\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.multicomp as multi\nimport scipy.stats\nimport numpy as np\nimport seaborn\nimport matplotlib.pyplot as plt\nimport warnings\nwarnings.filterwarnings('ignore')\nget_ipython().run_line_magic('matplotlib', 'inline')\n\n\n# ## Data Analysis\n\n#%%\n\nred = pd.read_csv('winequality-red.csv', low_memory=False, sep=';')\nwhite = pd.read_csv('winequality-white.csv', low_memory=False, sep=';')\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Function to select red or white dataset\ndef call(functionToCall):\n print('\\nRed Wine\\n')\n functionToCall(red)\n print('\\nWhite Wine\\n')\n functionToCall(white)\n\n#%%\n\n# Remove spaces from column names\ndef rm(wine_set):\n wine_set.columns = [x.strip().replace(' ', '_') for x in wine_set.columns]\n\n\ncall(rm)\n\n#%%\n\nred.head()\n\n#%%\n\nwhite.head()\n\n#%%\n\n# Covarience matrix\ndef covmax(wine_set):\n cov_mat = wine_set.corr(method='pearson')\n fig = plt.figure().add_subplot(111)\n plt.pcolor(cov_mat, cmap='RdBu')\n plt.colorbar()\n fig.set_xticklabels(wine_set.columns)\n fig.set_yticklabels(wine_set.columns)\n plt.show()\n\n\ncall(covmax)\n\n#%%\n\n# Add a column 'quality_mark'\ndef add_categ_quality(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n low['quality_mark'] = 'low'\n medium['quality_mark'] = 'medium'\n high['quality_mark'] = 'high'\n\n frames = [low, medium, high]\n return pd.concat(frames)\n\n\n# ## Hypothesis Testing and ANOVA\n\n#%%\n\n# Calculating the F-statistics and associated p-value\ndef anova(wine_set):\n prepared_data = add_categ_quality(wine_set)\n model1 = smf.ols(\n formula='total_sulfur_dioxide ~ C(quality_mark)', data=prepared_data)\n results1 = model1.fit()\n print(results1.summary())\n\n sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]\n print(\"\\nMeans for total sulfur dioxide by quality marks of wine \\n\")\n print(sub.groupby('quality_mark').mean())\n print('\\nStandard deviation for total sulfur dioxide by quality marks of wine \\n')\n print(sub.groupby('quality_mark').std(), '\\n')\n\n # Perform Post hoc test\n mc1 = multi.MultiComparison(\n sub['total_sulfur_dioxide'], sub['quality_mark'])\n res1 = mc1.tukeyhsd()\n print(res1.summary())\n\n\ncall(anova)\n\n#%%\n\n# Pearson Correlation\ndef pearson(wine_set):\n scat1 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=wine_set)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, 
gram\")\n plt.title(\"Association between wine's density and residual sugar \\n\")\n plt.show()\n\n print(scipy.stats.pearsonr(\n wine_set['density'], wine_set[\"residual_sugar\"]))\n\n\ncall(pearson)\n\n#%%\n\n# Exploring Statistical Interactions\ndef explore(wine_set):\n low = wine_set[wine_set['quality'] <= 5]\n medium = wine_set[(wine_set['quality'] == 6) | (wine_set['quality'] == 7)]\n high = wine_set[wine_set['quality'] > 7]\n\n print('association between wine`s density and residual sugar for wines \\nof `low` quality')\n print(scipy.stats.pearsonr(low['density'], low[\"residual_sugar\"]))\n print('\\nof `medium` quality')\n print(scipy.stats.pearsonr(medium['density'], medium[\"residual_sugar\"]))\n print('\\nof `high` quality')\n print(scipy.stats.pearsonr(high['density'], high[\"residual_sugar\"]))\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=low)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `low` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=medium)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `medium` quality\")\n plt.show()\n\n scat0 = seaborn.regplot(\n x=\"density\", y=\"residual_sugar\", fit_reg=True, data=high)\n plt.xlabel(\"Density of wine\")\n plt.ylabel(\"Residual sugar in wine, gram\")\n plt.title(\n \"Association between wine's density and residual sugar for wines of `high` quality\\n\")\n plt.show()\n\n\ncall(explore)\n\n#%%\n\ndef basicInfo(wine_set):\n print(len(wine_set))\n print(len(wine_set.columns))\n print(list(wine_set.columns.values))\n print(wine_set.ix[:10, :4])\n print('\\n')\n print(\"--------------describe the data-----------------\")\n print('\\n')\n print(wine_set.describe())\n\n\ncall(basicInfo)\n\n\n# ## Data Exploration\n\n#%%", "original_comment": "# print frequency distributions of wines' quality\n", "target_code": " print(wine_set.groupby(\"quality\").size()*100 / len(wine_set))\n", "project_metadata": {"full_name": "shrikant-temburwar/Wine-Quality-Dataset", "description": null, "topics": [], "git_url": "git://github.com/shrikant-temburwar/Wine-Quality-Dataset.git", "stars": 7, "watchers": 7, "forks": 13, "created": "2018-06-11T14:03:02Z", "size": 575, "license": "", "language": "Jupyter Notebook", "languages": {"Jupyter Notebook": 670078}, "last_updated": "2020-12-16T12:41:33Z"}, "annotations": [{"completed_by": {"id": 1}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 3}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}, {"completed_by": {"id": 2}, "usefulness": "Strongly disagree", "usefulness-score": 0, "coverage": "Strongly disagree", "coverage-score": 0, "precision": "Strongly disagree", "precision-score": 0, "compatibility": "Agree", "compatibility-score": 2}], "predicted_code": "frequencyDists(wine_set)\nfrequencyDists(wine_set)\n", "model": "docstring", "intent": " # print frequency distributions of wines' quality"}]